diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1816,13 +1816,15 @@ WorklistInserter AddNodes(*this); + DAG.AssignTopologicalOrder(); + // Add all the dag nodes to the worklist. // // Note: All nodes are not added to PruningList here, this is because the only // nodes which can be deleted are those which have no uses and all other nodes // which would otherwise be added to the worklist by the first call to // getNextWorklistEntry are already present in it. - for (SDNode &Node : DAG.allnodes()) + for (SDNode &Node : reverse(DAG.allnodes())) AddToWorklist(&Node, /* IsCandidateForPruning */ Node.use_empty()); // Create a dummy node (which is not added to allnodes), that adds a reference diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll --- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -12,14 +12,14 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movq (%esp), %mm0 -; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm1 +; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; CHECK-NEXT: movq (%esp), %mm1 ; CHECK-NEXT: maskmovq %mm0, %mm1 ; CHECK-NEXT: addl $16, %esp ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll --- a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll +++ b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll @@ -15,9 +15,10 @@ ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %ah, {{[0-9]+}}(%rsp) -; CHECK-NEXT: shrq $16, %rax +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) +; CHECK-NEXT: shrl $24, %eax ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %ah, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %r8b, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll --- a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll +++ b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll @@ -10,11 +10,13 @@ ; CHECK-LABEL: f: ; CHECK: # %bb.0: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd %xmm1, atomic -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movsd %xmm1, atomic2 -; CHECK-NEXT: movsd %xmm0, anything +; CHECK-NEXT: movsd %xmm0, atomic +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movsd %xmm0, atomic2 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, anything+4 +; CHECK-NEXT: movl %eax, anything ; CHECK-NEXT: movl ioport, %ecx ; CHECK-NEXT: movl ioport, %eax ; CHECK-NEXT: shrl $16, %eax diff --git a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll --- a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll +++ 
b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll @@ -8,8 +8,11 @@ ; CHECK-LABEL: test: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl $-2, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: leal -2(%eax,%ecx), %eax ; CHECK-NEXT: retl entry: %0 = ptrtoint ptr %a to i32 diff --git a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll --- a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll +++ b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: cpuid ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: movl %ebx, 8(%esi) -; CHECK-NEXT: movl %ecx, 12(%esi) ; CHECK-NEXT: movl %edx, 16(%esi) +; CHECK-NEXT: movl %ecx, 12(%esi) +; CHECK-NEXT: movl %ebx, 8(%esi) ; CHECK-NEXT: movl %eax, 4(%esi) ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll --- a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll +++ b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll @@ -17,17 +17,17 @@ ; CHECK-NEXT: movq %rdx, (%rsp) ; CHECK-NEXT: movq 24(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 16(%rdi), %rdx +; CHECK-NEXT: movq 56(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 32(%rdi), %rdx +; CHECK-NEXT: movq 48(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq 40(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 48(%rdi), %rdx +; CHECK-NEXT: movq 32(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 56(%rdi), %rdx +; CHECK-NEXT: movq 16(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %al, (%rsp) ; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,21 +17,19 @@ define dso_local i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl i(%rip), %esi -; CHECK-NEXT: movl j(%rip), %eax -; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq i(%rip), %rdx +; CHECK-NEXT: movq j(%rip), %rsi +; CHECK-NEXT: movsbl %sil, %eax +; CHECK-NEXT: idivb %dl +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $8, %edx -; CHECK-NEXT: movsbl %al, %ecx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: cbtw +; CHECK-NEXT: shrl $8, %esi +; CHECK-NEXT: movsbl %sil, %eax ; CHECK-NEXT: idivb %dl -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: idivb %sil -; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrb $1, %ecx, %xmm0 +; CHECK-NEXT: pinsrb $1, %eax, %xmm0 ; CHECK-NEXT: pextrw $0, %xmm0, res(%rip) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -20,13 
+20,15 @@ ; ; X64-LABEL: abd_ext_i8: ; X64: # %bb.0: -; X64-NEXT: movsbl %sil, %eax -; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movsbq %dil, %rcx +; X64-NEXT: movsbq %sil, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = sext i8 %a to i64 %bext = sext i8 %b to i64 @@ -50,13 +52,15 @@ ; ; X64-LABEL: abd_ext_i8_undef: ; X64: # %bb.0: -; X64-NEXT: movsbl %sil, %eax -; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movsbq %dil, %rcx +; X64-NEXT: movsbq %sil, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = sext i8 %a to i64 %bext = sext i8 %b to i64 @@ -80,13 +84,15 @@ ; ; X64-LABEL: abd_ext_i16: ; X64: # %bb.0: -; X64-NEXT: movswl %si, %eax -; X64-NEXT: movswl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = sext i16 %a to i64 %bext = sext i16 %b to i64 @@ -110,13 +116,15 @@ ; ; X64-LABEL: abd_ext_i16_undef: ; X64: # %bb.0: -; X64-NEXT: movswl %si, %eax -; X64-NEXT: movswl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = sext i16 %a to i64 %bext = sext i16 %b to i64 @@ -129,13 +137,19 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovlel %edx, %eax +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -159,13 +173,19 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32_undef: ; X86: # %bb.0: +; X86-NEXT: pushl %esi 
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovlel %edx, %eax +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -20,13 +20,13 @@ ; ; X64-LABEL: abd_ext_i8: ; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = zext i8 %a to i64 %bext = zext i8 %b to i64 @@ -50,13 +50,13 @@ ; ; X64-LABEL: abd_ext_i8_undef: ; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = zext i8 %a to i64 %bext = zext i8 %b to i64 @@ -80,13 +80,13 @@ ; ; X64-LABEL: abd_ext_i16: ; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = zext i16 %a to i64 %bext = zext i16 %b to i64 @@ -110,13 +110,13 @@ ; ; X64-LABEL: abd_ext_i16_undef: ; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = zext i16 %a to i64 %bext = zext i16 %b to i64 @@ -129,13 +129,13 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -159,13 +159,13 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32_undef: ; X86: # 
%bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: diff --git a/llvm/test/CodeGen/X86/absolute-constant.ll b/llvm/test/CodeGen/X86/absolute-constant.ll --- a/llvm/test/CodeGen/X86/absolute-constant.ll +++ b/llvm/test/CodeGen/X86/absolute-constant.ll @@ -10,7 +10,10 @@ define void @bar(ptr %x) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $foo, (%rdi) +; CHECK-NEXT: movsbl (%rdi), %eax +; CHECK-NEXT: movl $foo, %ecx +; CHECK-NEXT: movsbl %cl, %ecx +; CHECK-NEXT: testl %ecx, %eax ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: xorl %eax, %eax @@ -20,7 +23,10 @@ ; ; PIC-LABEL: bar: ; PIC: # %bb.0: # %entry -; PIC-NEXT: testb $foo, (%rdi) +; PIC-NEXT: movsbl (%rdi), %eax +; PIC-NEXT: movl $foo, %ecx +; PIC-NEXT: movsbl %cl, %ecx +; PIC-NEXT: testl %ecx, %eax ; PIC-NEXT: je .LBB0_1 ; PIC-NEXT: # %bb.2: # %if.then ; PIC-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll --- a/llvm/test/CodeGen/X86/add-cmov.ll +++ b/llvm/test/CodeGen/X86/add-cmov.ll @@ -368,7 +368,7 @@ ; CHECK-NEXT: addq $66, %rsi ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rax, %rsi -; CHECK-NEXT: decw (%rdx,%rsi) +; CHECK-NEXT: decw (%rsi,%rdx) ; CHECK-NEXT: retq %i = ptrtoint ptr %ptr to i64 %i66 = add i64 %i, 66 @@ -414,7 +414,7 @@ ; CHECK-NEXT: addq $66, %rdx ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rax, %rdx -; CHECK-NEXT: decw (%rsi,%rdx) +; CHECK-NEXT: decw (%rdx,%rsi) ; CHECK-NEXT: retq %i = ptrtoint ptr %ptr to i64 %i66 = add i64 %idx, 66 diff --git a/llvm/test/CodeGen/X86/add-of-mul.ll b/llvm/test/CodeGen/X86/add-of-mul.ll --- a/llvm/test/CodeGen/X86/add-of-mul.ll +++ b/llvm/test/CodeGen/X86/add-of-mul.ll @@ -26,7 +26,8 @@ define <4 x i32> @test_vector(<4 x i32> %x) { ; CHECK-LABEL: test_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: pslld $2, %xmm0 +; CHECK-NEXT: paddd %xmm0, %xmm0 +; CHECK-NEXT: paddd %xmm0, %xmm0 ; CHECK-NEXT: retq %mul = mul <4 x i32> %x, %add = add <4 x i32> %mul, %x diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -344,7 +344,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -367,7 +367,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -430,10 +430,10 @@ ; X86-LABEL: test_i32_add_sub_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: retl ; @@ -455,10 +455,10 @@ ; X86-LABEL: test_i32_add_sub_commute_var: ; X86: # %bb.0: ; X86-NEXT: 
movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: retl ; @@ -480,10 +480,10 @@ ; X86-LABEL: test_i32_sub_add_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: retl ; @@ -508,7 +508,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -559,7 +559,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl @@ -584,7 +584,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -605,10 +605,10 @@ ; X86-LABEL: test_i32_sub_sum_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: negl %eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -317,21 +317,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: movzbl %r10b, %r10d -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: adcq 16(%rsi), %r10 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: addq %r8, %r10 -; CHECK-NEXT: adcq 24(%rsi), %rcx -; CHECK-NEXT: addq %r9, %rcx -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r10, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: adcq 24(%rsi), %r9 +; CHECK-NEXT: movq %rdx, (%rdi) +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: retq entry: %0 = extractvalue %S %arg.b, 0 @@ -420,15 +412,15 @@ define i128 @addcarry_to_subcarry(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: addcarry_to_subcarry: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: notq %rcx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: setb %dl +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpq %rsi, %rdi -; CHECK-NEXT: notq %rsi -; CHECK-NEXT: setae %cl -; CHECK-NEXT: addb $-1, %cl -; CHECK-NEXT: adcq $0, %rax -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %edx -; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: setae %al +; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: adcq $0, %rdx ; 
CHECK-NEXT: retq %notb = xor i64 %b, -1 @@ -794,17 +786,20 @@ define i32 @add_U320_without_i128_add(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_add: ; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq 16(%rdi), %rax ; CHECK-NEXT: movq 24(%rdi), %r10 ; CHECK-NEXT: movq 32(%rdi), %r11 +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: adcq %rcx, %rbx ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: adcq %rcx, %rdx +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: adcq $0, %rbx ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: adcq %r8, %rcx -; CHECK-NEXT: cmpq %rax, %rdx +; CHECK-NEXT: cmpq %rax, %rbx ; CHECK-NEXT: adcq $0, %rcx ; CHECK-NEXT: leaq (%r11,%r9), %rsi ; CHECK-NEXT: addq %r8, %r10 @@ -816,10 +811,12 @@ ; CHECK-NEXT: cmpq %rsi, %r8 ; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %r9, %r11 -; CHECK-NEXT: movq %rdx, 16(%rdi) +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %rbx, 16(%rdi) ; CHECK-NEXT: movq %rcx, 24(%rdi) ; CHECK-NEXT: movq %r8, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, i64 0, i32 0, i64 1 @@ -1326,12 +1323,14 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi -; CHECK-NEXT: addq (%rsi), %rcx -; CHECK-NEXT: adcq 8(%rsi), %rdi ; CHECK-NEXT: movq 16(%rdx), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx -; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: addq 16(%rsi), %r8 ; CHECK-NEXT: adcq 24(%rsi), %rdx +; CHECK-NEXT: addq (%rsi), %rcx +; CHECK-NEXT: adcq 8(%rsi), %rdi +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -316,7 +316,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -326,7 +326,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -336,7 +336,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -976,38 +976,77 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: 
vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa 
{{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -1027,22 +1066,22 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1050,21 +1089,21 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1072,21 +1111,21 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1160,20 +1199,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; 
SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1181,19 +1220,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1201,18 +1240,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1285,20 +1324,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1306,19 +1345,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1326,18 +1365,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1410,19 +1449,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), 
%xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1430,17 +1469,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3 -; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,16 +1487,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1535,25 +1574,25 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; 
SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1561,20 +1600,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1582,19 +1621,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; 
AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1665,21 +1705,21 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1687,16 +1727,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1704,16 +1744,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1785,19 +1825,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 
; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1805,15 +1845,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1821,15 +1861,15 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1901,19 +1941,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1921,16 +1961,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1945,11 +1985,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2059,15 +2099,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2075,15 +2115,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; 
SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2097,11 +2137,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2211,15 +2251,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2227,15 +2267,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2249,11 +2289,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2285,12 +2325,12 @@ ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; 
AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2311,12 +2351,12 @@ ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2337,9 +2377,9 @@ ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -2414,22 +2454,23 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-NEXT: vpbroadcastb %xmm0, 
%xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2439,11 +2480,11 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2455,11 +2496,11 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2548,10 +2589,10 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2580,13 +2621,14 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2598,13 +2640,14 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, 
%zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2683,36 +2726,36 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2725,10 +2768,10 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2803,8 +2846,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2818,11 +2861,11 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 
32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2851,7 +2894,6 @@ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -2869,7 +2911,6 @@ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -2953,36 +2994,36 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2995,10 +3036,10 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: 
vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -3084,13 +3125,13 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3120,7 +3161,6 @@ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3138,7 +3178,6 @@ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3189,10 +3228,10 @@ ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq @@ -3211,10 +3250,10 @@ ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq @@ -3230,12 +3269,12 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; @@ -3345,8 +3384,8 @@ ; SSE42-NEXT: pblendvb 
%xmm0, %xmm2, %xmm1 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -3360,8 +3399,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -3486,18 +3525,18 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -3514,9 +3553,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3530,9 +3568,8 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3616,10 +3653,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 
16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3641,35 +3678,37 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3746,26 +3785,26 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa 
%xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3879,10 +3918,10 @@ ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3904,35 +3943,35 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: 
vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4008,12 +4047,12 @@ ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; @@ -4116,8 +4155,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4130,8 +4169,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4150,13 +4189,14 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4164,13 +4204,14 @@ ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; 
AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4247,29 +4288,29 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4282,11 +4323,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4299,11 +4340,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4378,10 +4419,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -4411,11 +4452,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4428,11 +4469,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4511,17 +4552,19 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4548,11 +4591,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4565,11 +4608,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512DQ-NEXT: 
vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4608,8 +4651,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4622,8 +4665,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4636,8 +4679,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4760,15 +4803,15 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2] -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4796,11 +4839,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4813,11 +4856,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; 
AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4870,8 +4913,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4884,8 +4927,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4896,11 +4939,11 @@ ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4988,17 +5031,17 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5007,14 +5050,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5022,10 +5065,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: 
vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5034,10 +5077,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5046,10 +5089,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5078,17 +5121,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5096,14 +5139,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5111,10 +5154,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; 
AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5123,10 +5166,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5135,10 +5178,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5167,17 +5210,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5185,14 +5228,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5200,10 +5243,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5212,10 +5255,10 @@ ; 
AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5224,10 +5267,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5255,31 +5298,31 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5287,10 +5330,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5299,10 +5342,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa 
%ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5311,10 +5354,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5342,44 +5385,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5387,10 +5430,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; 
AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5398,10 +5441,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5547,44 +5590,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5592,10 +5635,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5603,10 +5646,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; 
AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -283,7 +283,7 @@ ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -292,7 +292,7 @@ ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -301,7 +301,7 @@ ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -609,16 +609,16 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512F-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -626,40 +626,37 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512DQ: # 
%bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> @@ -711,12 +708,12 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; 
AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -724,36 +721,29 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> @@ -855,19 +845,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -876,17 +866,17 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -896,16 +886,16 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa 
%xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -966,18 +956,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -989,10 +979,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1003,10 +993,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1064,18 +1054,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: 
movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1087,10 +1077,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1101,10 +1091,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1164,15 +1154,15 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1180,24 +1170,24 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 +; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 
(%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1268,10 +1258,10 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1279,16 +1269,16 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE42-NEXT: pshufb %xmm2, %xmm1 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rsi), %xmm3 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1296,15 +1286,15 @@ ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1361,30 +1351,30 @@ ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 
16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1392,10 +1382,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1452,27 +1442,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1480,10 +1470,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 
%xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1542,22 +1532,22 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1565,11 +1555,11 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1626,35 +1616,35 @@ ; SSE2-NEXT: movaps (%rdi), %xmm0 ; SSE2-NEXT: movaps 32(%rdi), %xmm1 ; SSE2-NEXT: movaps 48(%rdi), %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), 
%xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1738,36 +1728,36 @@ ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: movapd 32(%rdi), %xmm1 +; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1780,62 +1770,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; 
AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512DQ-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <4 x i32> @@ -1930,11 +1890,10 @@ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1945,11 +1904,10 @@ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2020,10 +1978,10 @@ ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -2048,10 +2006,12 @@ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2062,10 +2022,12 @@ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2159,11 +2121,10 @@ ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2173,11 +2134,10 @@ ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 
32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2229,13 +2189,13 @@ ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2247,12 +2207,12 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2271,33 +2231,31 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2392,11 +2350,10 @@ ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2406,11 +2363,10 @@ ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2502,33 +2458,31 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2567,10 +2521,10 @@ ; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa 16(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) +; SSE2-NEXT: movdqa %xmm0, 32(%rdx) ; SSE2-NEXT: movdqa %xmm3, 16(%rdx) ; SSE2-NEXT: movdqa %xmm2, (%rdx) ; SSE2-NEXT: retq @@ -2585,10 +2539,10 @@ ; SSE42-NEXT: paddb (%rsi), %xmm3 ; SSE42-NEXT: movdqa 16(%rsi), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 48(%rdx) +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq @@ -2600,12 +2554,12 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -2700,8 +2654,8 @@ ; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: paddb 16(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2712,8 +2666,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2927,10 +2881,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa 
%xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2950,10 +2904,12 @@ ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2],xmm2[3],mem[4,5],xmm2[6],mem[7] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -2963,10 +2919,12 @@ ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2],xmm2[3],mem[4,5],xmm2[6],mem[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3029,10 +2987,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -3135,13 +3093,13 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -3243,13 +3201,13 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; 
AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; @@ -3333,10 +3291,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3344,10 +3302,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3440,13 +3398,13 @@ ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3468,11 +3426,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3480,11 +3438,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3571,11 +3529,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) 
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3583,11 +3541,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3644,14 +3602,14 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -3673,11 +3631,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3685,11 +3643,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3715,14 +3673,14 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movaps 48(%rdi), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: 
movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3731,10 +3689,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3830,14 +3788,14 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -3858,11 +3816,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3870,11 +3828,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3904,10 +3862,10 @@ ; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -3916,10 +3874,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; 
SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -3927,11 +3885,11 @@ ; AVX-NEXT: vmovapd (%rdi), %ymm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3987,40 +3945,40 @@ ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4028,10 +3986,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4039,10 +3997,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4065,40 +4023,40 @@ ; SSE-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4106,10 +4064,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4117,10 +4075,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; 
AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4143,40 +4101,40 @@ ; SSE-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4184,10 +4142,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4195,10 +4153,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4221,39 +4179,39 @@ ; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa 
(%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) -; AVX-NEXT: retq +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4261,10 +4219,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4272,10 +4230,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4299,59 +4257,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; 
SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4374,40 +4332,40 @@ ; SSE-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: 
vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4415,10 +4373,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4426,10 +4384,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4454,40 +4412,40 @@ ; SSE-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), 
%xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4495,10 +4453,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4506,10 +4464,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4534,39 +4492,39 @@ ; SSE-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: 
vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4574,10 +4532,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4585,10 +4543,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4614,59 +4572,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: 
vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4691,60 +4649,60 @@ ; SSE-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; 
AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4769,59 +4727,59 @@ ; SSE-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: ; 
AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4847,59 +4805,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; 
AVX512DQ-NEXT: retq ; @@ -4924,59 +4882,59 @@ ; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastq (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5002,59 +4960,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb 
%xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5080,31 +5038,31 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm3, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: paddb 32(%rsi), %xmm0 +; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 
16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq 16(%rdi), %rcx -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: movq %rcx, %r9 -; AVX-NEXT: movq %rcx, %r10 -; AVX-NEXT: movl %ecx, %r11d -; AVX-NEXT: movl %ecx, %ebx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: movq %rax, %r8 +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movl %eax, %r11d +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: shrl $16, %ebx ; AVX-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 ; AVX-NEXT: shrl $24, %r11d @@ -5115,82 +5073,82 @@ ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX-NEXT: shrq $48, %r8 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: movq 24(%rdi), %rcx -; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: shrl $16, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl %ecx, %eax -; AVX-NEXT: shrl $24, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $32, %rax -; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $40, %rax -; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $8, %ecx -; AVX-NEXT: vmovd %eax, %xmm1 -; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx ; AVX-NEXT: shrl $24, %ecx -; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $32, %rcx -; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $40, %rcx -; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $48, %rcx -; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 -; AVX-NEXT: movq 8(%rdi), %rcx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX-NEXT: movq 16(%rdi), %rcx ; AVX-NEXT: shrq $56, %rax -; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $8, %eax -; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $16, %eax -; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: movl %ecx, %eax ; AVX-NEXT: shrl $24, %eax -; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: 
vpinsrb $3, %eax, %xmm1, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $32, %rax -; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $40, %rax -; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: movq 24(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx -; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $16, %ecx +; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: shrl $24, %ecx +; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq $40, %rcx +; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: shrq $48, %rcx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX-NEXT: shrq $56, %rax +; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq ; @@ -5286,39 +5244,310 @@ ; AVX2-NEXT: shrq $56, %rcx ; AVX2-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq 16(%rdi), %rcx +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: movq %rcx, %r8 +; AVX512F-NEXT: movq %rcx, %r9 +; AVX512F-NEXT: movq %rcx, %r10 +; AVX512F-NEXT: movl %ecx, %r11d +; AVX512F-NEXT: movl %ecx, %ebx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $16, %ebx +; AVX512F-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $24, %r11d +; AVX512F-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $32, %r10 +; AVX512F-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $40, %r9 +; AVX512F-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 
+; AVX512F-NEXT: shrq $48, %r8 +; AVX512F-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512F-NEXT: movq 24(%rdi), %rcx +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $24, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $32, %rax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $40, %rax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movq (%rdi), %rax +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $24, %ecx +; AVX512F-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $40, %rcx +; AVX512F-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq 8(%rdi), %rcx +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $24, %eax +; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $32, %rax +; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $40, %rax +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: movq 16(%rdi), %rcx +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: movq %rcx, %r8 +; AVX512DQ-NEXT: movq %rcx, %r9 +; AVX512DQ-NEXT: movq %rcx, %r10 +; AVX512DQ-NEXT: movl %ecx, %r11d +; AVX512DQ-NEXT: 
movl %ecx, %ebx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $16, %ebx +; AVX512DQ-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $24, %r11d +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $32, %r10 +; AVX512DQ-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $40, %r9 +; AVX512DQ-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $48, %r8 +; AVX512DQ-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq 24(%rdi), %rcx +; AVX512DQ-NEXT: shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $16, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $24, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $32, %rax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $40, %rax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $48, %rax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq (%rdi), %rax +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $24, %ecx +; AVX512DQ-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $40, %rcx +; AVX512DQ-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $48, %rcx +; AVX512DQ-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq 8(%rdi), %rcx +; AVX512DQ-NEXT: shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $16, %eax +; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $24, %eax +; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $32, %rax +; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $40, %rax +; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $48, %rax +; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: popq %rbx ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: 
vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq 16(%rdi), %rcx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: movq %rcx, %r8 +; AVX512BW-NEXT: movq %rcx, %r9 +; AVX512BW-NEXT: movq %rcx, %r10 +; AVX512BW-NEXT: movl %ecx, %r11d +; AVX512BW-NEXT: movl %ecx, %ebx +; AVX512BW-NEXT: vmovd %ecx, %xmm0 +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $16, %ebx +; AVX512BW-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $24, %r11d +; AVX512BW-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $32, %r10 +; AVX512BW-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $40, %r9 +; AVX512BW-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $48, %r8 +; AVX512BW-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: movq 24(%rdi), %rcx +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $24, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $32, %rax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $40, %rax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $48, %rax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movq (%rdi), %rax +; AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $24, %ecx +; AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $40, %rcx +; AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $48, %rcx +; AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq 8(%rdi), %rcx +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $24, %eax +; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $32, %rax +; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $40, %rax +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $48, %rax +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; 
AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -93,8 +93,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl %ecx, (%esp) ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %ecx, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -258,8 +258,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -421,8 +421,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -589,8 +589,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -691,8 +691,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl %edx, (%esp) ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %edx, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -233,10 +233,10 @@ ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLM-NEXT: movl %edi, 8(%esi) -; X86-SLM-NEXT: movl %edx, 12(%esi) -; X86-SLM-NEXT: movl %eax, (%esi) +; X86-SLM-NEXT: movl %edi, 12(%esi) +; X86-SLM-NEXT: movl %edx, 8(%esi) ; X86-SLM-NEXT: movl %ecx, 4(%esi) +; X86-SLM-NEXT: movl %eax, (%esi) ; X86-SLM-NEXT: movl %esi, %eax ; X86-SLM-NEXT: leal -8(%ebp), %esp ; X86-SLM-NEXT: popl %esi @@ -273,11 +273,11 @@ ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-ATOM-NEXT: movl %eax, 8(%esi) -; X86-ATOM-NEXT: movl %edi, 12(%esi) -; X86-ATOM-NEXT: movl %ecx, (%esi) -; X86-ATOM-NEXT: movl %esi, %eax +; X86-ATOM-NEXT: movl %eax, 12(%esi) +; X86-ATOM-NEXT: movl %edi, 8(%esi) ; X86-ATOM-NEXT: movl 
%edx, 4(%esi) +; X86-ATOM-NEXT: movl %esi, %eax +; X86-ATOM-NEXT: movl %ecx, (%esi) ; X86-ATOM-NEXT: leal -8(%ebp), %esp ; X86-ATOM-NEXT: popl %esi ; X86-ATOM-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/atomic-mi.ll b/llvm/test/CodeGen/X86/atomic-mi.ll --- a/llvm/test/CodeGen/X86/atomic-mi.ll +++ b/llvm/test/CodeGen/X86/atomic-mi.ll @@ -751,10 +751,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: andl 16(%ebp), %edx ; X32-NEXT: andl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: andl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -973,10 +973,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: orl 16(%ebp), %edx ; X32-NEXT: orl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: orl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -1195,10 +1195,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: xorl 16(%ebp), %edx ; X32-NEXT: xorl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: xorl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -1603,10 +1603,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: notl %edx ; X32-NEXT: notl %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: notl %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -436,10 +436,10 @@ ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE1-NEXT: movl %edi, 8(%esi) -; X86-SSE1-NEXT: movl %edx, 12(%esi) -; X86-SSE1-NEXT: movl %eax, (%esi) +; X86-SSE1-NEXT: movl %edi, 12(%esi) +; X86-SSE1-NEXT: movl %edx, 8(%esi) ; X86-SSE1-NEXT: movl %ecx, 4(%esi) +; X86-SSE1-NEXT: movl %eax, (%esi) ; X86-SSE1-NEXT: movl %esi, %eax ; X86-SSE1-NEXT: addl $20, %esp ; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 @@ -561,10 +561,10 @@ ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, 8(%esi) -; X86-NOSSE-NEXT: movl %edx, 12(%esi) -; X86-NOSSE-NEXT: movl %eax, (%esi) +; X86-NOSSE-NEXT: movl %edi, 12(%esi) +; X86-NOSSE-NEXT: movl %edx, 8(%esi) ; X86-NOSSE-NEXT: movl %ecx, 4(%esi) +; X86-NOSSE-NEXT: movl %eax, (%esi) ; X86-NOSSE-NEXT: movl %esi, %eax ; X86-NOSSE-NEXT: addl $20, %esp ; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 diff --git a/llvm/test/CodeGen/X86/atomic-xor.ll b/llvm/test/CodeGen/X86/atomic-xor.ll --- a/llvm/test/CodeGen/X86/atomic-xor.ll +++ b/llvm/test/CodeGen/X86/atomic-xor.ll @@ -40,10 +40,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll --- a/llvm/test/CodeGen/X86/atomic128.ll +++ b/llvm/test/CodeGen/X86/atomic128.ll @@ -63,10 +63,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 @@ -173,10 +173,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -241,10 +241,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -309,10 +309,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -377,10 +377,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -448,10 +448,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -519,10 +519,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), 
%esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -590,10 +590,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -661,10 +661,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -731,10 +731,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 @@ -803,10 +803,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -14,13 +14,38 @@ ; SSE2-NEXT: movd %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovd %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} 
xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v4i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <4 x i8>, ptr %a %2 = load <4 x i8>, ptr %b %3 = zext <4 x i8> %1 to <4 x i32> @@ -42,13 +67,45 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq 
%xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = load <8 x i8>, ptr %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -69,12 +126,54 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = load <16 x i8>, ptr %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -90,28 +189,28 @@ define void @avg_v24i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v24i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v24i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v24i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %xmm0, (%rax) @@ -120,8 +219,8 @@ ; ; AVX512-LABEL: avg_v24i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovq %xmm1, (%rax) ; AVX512-NEXT: vmovdqu %xmm0, (%rax) @@ -142,36 +241,89 @@ define void @avg_v32i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb 
%zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -190,12 +342,12 @@ define void @avg_v48i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v48i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) @@ -203,12 +355,12 @@ ; ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm2, (%rax) @@ -216,10 +368,10 @@ ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -227,10 +379,10 @@ ; ; AVX512F-LABEL: avg_v48i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -238,8 +390,8 @@ ; ; AVX512BW-LABEL: avg_v48i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax) ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -259,14 +411,14 @@ define void @avg_v64i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 -; SSE2-NEXT: pavgb 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 +; SSE2-NEXT: pavgb 48(%rsi), %xmm3 ; 
SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) @@ -275,46 +427,173 @@ ; ; AVX1-LABEL: avg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpavgb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = 
ymm4[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdb %zmm5, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm5 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512BW-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512BW-NEXT: vpmovdb %zmm5, %xmm5 +; AVX512BW-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpavgb %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -341,9 +620,11 @@ ; ; AVX-LABEL: avg_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <4 x i16>, ptr %a @@ -366,12 +647,41 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} 
ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = load <8 x i16>, ptr %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -387,36 +697,55 @@ define void @avg_v16i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 
+; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -435,14 +764,14 @@ define void @avg_v32i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 -; SSE2-NEXT: pavgw 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) @@ -451,46 +780,93 @@ ; ; AVX1-LABEL: avg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpavgw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -509,16 +885,16 @@ define void @avg_v40i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v40i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 -; SSE2-NEXT: pavgw 48(%rdi), %xmm3 -; SSE2-NEXT: movdqa 64(%rsi), %xmm4 -; SSE2-NEXT: pavgw 64(%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 64(%rdi), %xmm4 +; SSE2-NEXT: pavgw 64(%rsi), %xmm4 ; SSE2-NEXT: movdqu %xmm4, (%rax) ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) @@ -528,16 +904,16 @@ ; ; AVX1-LABEL: avg_v40i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX1-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm4 
-; AVX1-NEXT: vpavgw (%rdi), %xmm1, %xmm1
-; AVX1-NEXT: vpavgw 16(%rdi), %xmm2, %xmm2
-; AVX1-NEXT: vpavgw 32(%rdi), %xmm3, %xmm3
-; AVX1-NEXT: vpavgw 48(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX1-NEXT: vpavgw 64(%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4
+; AVX1-NEXT: vpavgw (%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpavgw 16(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpavgw 32(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vpavgw 48(%rsi), %xmm4, %xmm4
; AVX1-NEXT: vmovdqu %xmm4, (%rax)
; AVX1-NEXT: vmovdqu %xmm3, (%rax)
; AVX1-NEXT: vmovdqu %xmm2, (%rax)
@@ -547,12 +923,12 @@
;
; AVX2-LABEL: avg_v40i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rsi), %ymm0
-; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
-; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa 64(%rsi), %xmm2
-; AVX2-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX2-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2
; AVX2-NEXT: vmovdqu %xmm2, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -561,12 +937,12 @@
;
; AVX512F-LABEL: avg_v40i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
-; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2
-; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512F-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %xmm2, (%rax)
@@ -575,10 +951,10 @@
;
; AVX512BW-LABEL: avg_v40i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1
-; AVX512BW-NEXT: vpavgw 64(%rdi), %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512BW-NEXT: vpavgw 64(%rsi), %xmm1, %xmm1
; AVX512BW-NEXT: vmovdqu %xmm1, (%rax)
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
@@ -604,13 +980,38 @@
; SSE2-NEXT: movd %xmm1, (%rax)
; SSE2-NEXT: retq
;
-; AVX-LABEL: avg_v4i8_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, (%rax)
-; AVX-NEXT: retq
+; AVX1-LABEL: avg_v4i8_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, (%rax)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v4i8_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxbd
{{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v4i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovd %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <4 x i8>, ptr %a %2 = load <4 x i8>, ptr %b %3 = zext <4 x i8> %1 to <4 x i32> @@ -632,13 +1033,45 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = load <8 x i8>, ptr %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -659,12 +1092,54 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8_2: +; AVX512: # %bb.0: +; 
AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = load <16 x i8>, ptr %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -690,26 +1165,79 @@ ; ; AVX1-LABEL: avg_v32i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -740,28 +1268,96 @@ ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) -; AVX2-NEXT: vmovups %ymm0, (%rax) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = 
ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: avg_v64i8_2: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rsi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: avg_v64i8_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm3, (%rax) +; AVX512F-NEXT: vpmovdb %zmm2, (%rax) +; AVX512F-NEXT: vpmovdb %zmm1, (%rax) +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v64i8_2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <64 x i8>, ptr %a %2 = load <64 x i8>, ptr %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -786,8 +1382,10 @@ ; ; AVX-LABEL: avg_v4i16_2: ; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq @@ -811,12 +1409,41 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 
+; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = load <8 x i16>, ptr %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -842,26 +1469,45 @@ ; ; AVX1-LABEL: avg_v16i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -896,46 +1542,93 @@ ; ; AVX1-LABEL: avg_v32i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 
48(%rdi), %xmm3 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw 
%ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; 
AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpavgw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -961,7 +1654,9 @@ ; ; AVX-LABEL: avg_v4i8_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq @@ -982,12 +1677,35 @@ ; SSE2-NEXT: movq %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = zext <8 x i8> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -1005,12 +1723,40 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = zext <16 x i8> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -1033,17 +1779,40 @@ ; ; AVX1-LABEL: avg_v32i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1051,7 +1820,11 @@ ; ; AVX512-LABEL: avg_v32i8_const: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -1084,41 +1857,109 @@ ; ; AVX1-LABEL: avg_v64i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpavgb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; 
AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpavgb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpavgb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_const: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1142,7 +1983,8 @@ ; ; AVX-LABEL: avg_v4i16_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq @@ -1163,12 +2005,33 @@ ; SSE2-NEXT: movdqu 
%xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = zext <8 x i16> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -1191,16 +2054,25 @@ ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1208,7 +2080,8 @@ ; ; AVX512-LABEL: avg_v16i16_const: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -1241,23 +2114,43 @@ ; ; AVX1-LABEL: 
avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper @@ -1265,10 +2158,14 @@ ; ; AVX512F-LABEL: avg_v32i16_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1276,7 +2173,11 @@ ; ; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1739,141 +2640,114 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movaps (%rsi), %xmm0 -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps (%rsi), %xmm1 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: addq %rdx, %rbp -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: addq %rcx, %rdx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r13,%rcx), %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r12,%rcx), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r15,%rcx), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r14,%rcx), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq 
-1(%rbx,%rcx), %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r11,%rcx), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r10,%rcx), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r9,%rcx), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%r8,%rcx), %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%rdi,%rcx), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%rsi,%rcx), %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: leaq -1(%rax,%rcx), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rbp,%r9), %r9d +; SSE2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%r8,%r9), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rdi,%r9), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%r15,%r9), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rsi,%r9), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%r14,%r9), %r14d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%r13,%r9), %ebp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rcx,%r9), %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rdx,%r9), %edx +; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rbx,%r9), %ebx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%r11,%r9), %r11d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%r10,%r9), %r13d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: leal -1(%rax,%r9), %r10d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: leaq -1(%rcx,%rax), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: leal -1(%r12,%rax), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE2-NEXT: leaq -1(%rcx,%rax), %rax -; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: addq $-1, %rbp -; SSE2-NEXT: movl $0, %eax -; SSE2-NEXT: adcq $-1, %rax -; SSE2-NEXT: addq $-1, %rdx -; SSE2-NEXT: adcq $-1, %rcx -; SSE2-NEXT: shldq $63, %rdx, %rcx -; SSE2-NEXT: shldq $63, %rbp, %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: shrq %r13 -; SSE2-NEXT: movq %r13, %xmm3 -; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movq %r12, %xmm2 -; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movq %r15, %xmm5 -; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movq %r14, %xmm4 -; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movq %rbx, %xmm6 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm7 -; SSE2-NEXT: shrq %r10 -; SSE2-NEXT: movq %r10, %xmm9 -; SSE2-NEXT: shrq %r9 -; SSE2-NEXT: movq %r9, %xmm8 -; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm11 -; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm12 -; SSE2-NEXT: shrq %rsi -; SSE2-NEXT: movq %rsi, %xmm13 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm10 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm14 -; 
SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm15 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: psllq $48, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE2-NEXT: leal -1(%rdx,%rax), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE2-NEXT: leal -1(%rdx,%rax), %eax +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload +; SSE2-NEXT: shrl %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: shrl %r8d +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: shrl %edi +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: shrl %r15d +; SSE2-NEXT: movd %r15d, %xmm2 +; SSE2-NEXT: shrl %esi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: shrl %r14d +; SSE2-NEXT: movd %r14d, %xmm6 +; SSE2-NEXT: shrl %ebp +; SSE2-NEXT: movd %ebp, %xmm7 +; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: shrl %ebx +; SSE2-NEXT: movd %ebx, %xmm9 +; SSE2-NEXT: shrl %r11d +; SSE2-NEXT: movd %r11d, %xmm10 +; SSE2-NEXT: shrl %r13d +; SSE2-NEXT: movd %r13d, %xmm11 +; SSE2-NEXT: shrl %r10d +; SSE2-NEXT: movd %r10d, %xmm12 +; SSE2-NEXT: shrl %r12d +; SSE2-NEXT: movd %r12d, %xmm13 +; 
SSE2-NEXT: shrl %r9d +; SSE2-NEXT: movd %r9d, %xmm14 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movupd %xmm2, (%rax) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm3[0] +; SSE2-NEXT: movdqu %xmm15, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1894,102 +2768,92 @@ ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %eax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm0, %ebx -; AVX1-NEXT: vpextrw $7, %xmm0, %esi -; AVX1-NEXT: vpextrw $0, %xmm3, %edi -; AVX1-NEXT: vpextrw $1, %xmm3, %r8d -; AVX1-NEXT: vpextrw $2, %xmm3, %r9d -; AVX1-NEXT: vpextrw $3, %xmm3, %r10d -; AVX1-NEXT: vpextrw $4, %xmm3, %r11d -; 
AVX1-NEXT: vpextrw $5, %xmm3, %r14d -; AVX1-NEXT: vpextrw $6, %xmm3, %r15d -; AVX1-NEXT: vpextrw $7, %xmm3, %edx -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: vpextrw $0, %xmm0, %r12d -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addq %rax, %rcx -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: addq %r12, %rax -; AVX1-NEXT: vpextrw $7, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%rdx,%r12), %rdx -; AVX1-NEXT: vpextrw $6, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%r15,%r12), %rbp -; AVX1-NEXT: vpextrw $5, %xmm2, %r15d -; AVX1-NEXT: leaq -1(%r14,%r15), %r13 -; AVX1-NEXT: vpextrw $4, %xmm2, %r14d -; AVX1-NEXT: leaq -1(%r11,%r14), %r12 -; AVX1-NEXT: vpextrw $3, %xmm2, %r11d -; AVX1-NEXT: leaq -1(%r10,%r11), %r15 -; AVX1-NEXT: vpextrw $2, %xmm2, %r10d -; AVX1-NEXT: leaq -1(%r9,%r10), %r14 -; AVX1-NEXT: vpextrw $1, %xmm2, %r9d -; AVX1-NEXT: leaq -1(%r8,%r9), %r11 -; AVX1-NEXT: vpextrw $0, %xmm2, %r8d -; AVX1-NEXT: leaq -1(%rdi,%r8), %r10 -; AVX1-NEXT: vpextrw $7, %xmm1, %edi -; AVX1-NEXT: leaq -1(%rsi,%rdi), %r9 -; AVX1-NEXT: vpextrw $6, %xmm1, %esi -; AVX1-NEXT: leaq -1(%rbx,%rsi), %r8 -; AVX1-NEXT: vpextrw $5, %xmm1, %esi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rdi,%rsi), %rsi -; AVX1-NEXT: vpextrw $4, %xmm1, %edi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX1-NEXT: leaq -1(%rbx,%rdi), %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm0, %edi -; AVX1-NEXT: vpextrw $3, %xmm1, %ebx -; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: vpextrw $0, %xmm0, %edx +; AVX1-NEXT: vpextrw $1, %xmm0, %esi ; AVX1-NEXT: vpextrw $2, %xmm0, %edi -; AVX1-NEXT: vpextrw $2, %xmm1, %ebx -; AVX1-NEXT: leaq -1(%rdi,%rbx), %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %edi, %edi -; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: adcq $-1, %rbx -; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: adcq $-1, %rdi -; AVX1-NEXT: shldq $63, %rax, %rdi -; AVX1-NEXT: shldq $63, %rcx, %rbx -; AVX1-NEXT: shrq %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 -; AVX1-NEXT: shrq %rbp -; AVX1-NEXT: vmovq %rbp, %xmm1 -; AVX1-NEXT: shrq %r13 -; AVX1-NEXT: vmovq %r13, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %r8d +; AVX1-NEXT: vpextrw $0, %xmm3, %r9d +; AVX1-NEXT: vpextrw $1, %xmm3, %r10d +; AVX1-NEXT: vpextrw $2, %xmm3, %eax +; AVX1-NEXT: vpextrw $3, %xmm3, %r14d +; AVX1-NEXT: vpextrw $4, %xmm3, %r15d +; AVX1-NEXT: vpextrw $5, %xmm3, %ebx +; AVX1-NEXT: vpextrw $6, %xmm3, %r12d +; AVX1-NEXT: vpextrw $7, %xmm3, %r11d +; AVX1-NEXT: vpextrw $7, %xmm2, %r13d +; AVX1-NEXT: leal -1(%r11,%r13), %r11d +; AVX1-NEXT: vpextrw $6, %xmm2, %r13d +; AVX1-NEXT: leal -1(%r12,%r13), %ebp +; AVX1-NEXT: vpextrw $5, %xmm2, %r12d +; AVX1-NEXT: leal -1(%rbx,%r12), %ebx +; AVX1-NEXT: vpextrw $4, %xmm2, %r12d +; AVX1-NEXT: leal -1(%r15,%r12), %r13d +; AVX1-NEXT: vpextrw $3, %xmm2, %r15d +; AVX1-NEXT: leal -1(%r14,%r15), %r14d +; AVX1-NEXT: vpextrw $2, %xmm2, %r15d +; AVX1-NEXT: leal -1(%rax,%r15), %eax +; AVX1-NEXT: vpextrw $1, %xmm2, %r15d +; AVX1-NEXT: leaq -1(%r10,%r15), %r12 +; AVX1-NEXT: vpextrw $0, %xmm2, %r10d +; AVX1-NEXT: leaq -1(%r9,%r10), %r15 +; AVX1-NEXT: vpextrw $3, %xmm1, %r9d +; AVX1-NEXT: leaq -1(%r8,%r9), %r10 +; AVX1-NEXT: vpextrw $2, %xmm1, %r8d +; AVX1-NEXT: leaq -1(%rdi,%r8), %r9 +; AVX1-NEXT: vpextrw $1, %xmm1, %edi +; AVX1-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX1-NEXT: vpextrw 
$0, %xmm1, %esi +; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX1-NEXT: vpextrw $7, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX1-NEXT: leaq -1(%rdx,%rcx), %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrw $4, %xmm0, %ecx +; AVX1-NEXT: vpextrw $4, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rcx +; AVX1-NEXT: shrl %r11d +; AVX1-NEXT: vmovd %r11d, %xmm0 +; AVX1-NEXT: shrl %ebp +; AVX1-NEXT: vmovd %ebp, %xmm1 +; AVX1-NEXT: shrl %ebx +; AVX1-NEXT: vmovd %ebx, %xmm2 +; AVX1-NEXT: shrl %r13d +; AVX1-NEXT: vmovd %r13d, %xmm3 +; AVX1-NEXT: shrl %r14d +; AVX1-NEXT: vmovd %r14d, %xmm4 +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm5 ; AVX1-NEXT: shrq %r12 -; AVX1-NEXT: vmovq %r12, %xmm3 +; AVX1-NEXT: vmovq %r12, %xmm6 ; AVX1-NEXT: shrq %r15 -; AVX1-NEXT: vmovq %r15, %xmm4 -; AVX1-NEXT: shrq %r14 -; AVX1-NEXT: vmovq %r14, %xmm5 -; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm6 +; AVX1-NEXT: vmovq %r15, %xmm7 ; AVX1-NEXT: shrq %r10 -; AVX1-NEXT: vmovq %r10, %xmm7 +; AVX1-NEXT: vmovq %r10, %xmm8 ; AVX1-NEXT: shrq %r9 -; AVX1-NEXT: vmovq %r9, %xmm8 +; AVX1-NEXT: vmovq %r9, %xmm9 ; AVX1-NEXT: shrq %r8 -; AVX1-NEXT: vmovq %r8, %xmm9 +; AVX1-NEXT: vmovq %r8, %xmm10 +; AVX1-NEXT: shrq %rdi +; AVX1-NEXT: vmovq %rdi, %xmm11 ; AVX1-NEXT: shrq %rsi -; AVX1-NEXT: vmovq %rsi, %xmm10 +; AVX1-NEXT: vmovq %rsi, %xmm12 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %rbx, %xmm12 -; AVX1-NEXT: vmovq %rdi, %xmm13 +; AVX1-NEXT: vmovq %rax, %xmm13 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: vmovq %rax, %xmm14 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm15 +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm15 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] @@ -2003,14 +2867,13 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpsllq 
$48, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX1-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: popq %rbx @@ -2029,165 +2892,119 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vpextrq $1, %xmm2, %rsi ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-NEXT: vpextrq $1, %xmm5, %rbx +; AVX2-NEXT: vmovq %xmm5, %r12 +; AVX2-NEXT: vpextrq $1, %xmm2, %r13 +; AVX2-NEXT: vmovq %xmm2, %r14 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %r15 +; AVX2-NEXT: vmovq %xmm2, %r11 +; AVX2-NEXT: vpextrq $1, %xmm1, %r10 +; AVX2-NEXT: vmovq %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm3, %r8 +; AVX2-NEXT: vmovq %xmm3, %rdi +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vmovq %xmm7, %rsi -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %rdx -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: vpextrq $1, %xmm8, %r13 -; AVX2-NEXT: vpextrq $1, %xmm2, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r15 -; AVX2-NEXT: vpextrq $1, %xmm6, %r12 -; AVX2-NEXT: vpextrq $1, %xmm4, %rbx -; AVX2-NEXT: vpextrq $1, %xmm1, %rdi -; AVX2-NEXT: vpextrq $1, %xmm3, %rcx -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vpextrq $1, %xmm0, %r11 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpextrq $1, %xmm9, %r9 -; AVX2-NEXT: addq %r13, %r9 -; AVX2-NEXT: movq %r9, %r13 -; AVX2-NEXT: vpextrq $1, %xmm8, %r9 -; AVX2-NEXT: addq %r14, %r9 -; AVX2-NEXT: movq %r9, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r10 -; AVX2-NEXT: addq %r15, %r10 -; AVX2-NEXT: vpextrq $1, %xmm5, %r15 -; AVX2-NEXT: addq %r12, %r15 -; AVX2-NEXT: vpextrq $1, %xmm4, %r12 -; AVX2-NEXT: addq %rbx, %r12 -; AVX2-NEXT: vpextrq $1, %xmm3, %rbp -; AVX2-NEXT: addq %rdi, %rbp -; AVX2-NEXT: vpextrq $1, %xmm6, %r9 -; AVX2-NEXT: addq %rcx, %r9 -; AVX2-NEXT: vmovq %xmm6, %rdi -; AVX2-NEXT: addq %rax, %rdi -; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r11, %rcx -; AVX2-NEXT: vmovq %xmm9, %r11 -; AVX2-NEXT: leaq -1(%r8,%r11), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: leaq -1(%rdx,%r8), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm7, %rdx -; AVX2-NEXT: leaq -1(%rsi,%rdx), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm5, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdx), %rax -; AVX2-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm4, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdx), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpextrq $1, %xmm6, %rax +; AVX2-NEXT: leal -1(%rbx,%rax), %ebx +; AVX2-NEXT: vmovq %xmm6, %rax +; AVX2-NEXT: leal -1(%r12,%rax), %r12d +; AVX2-NEXT: vpextrq $1, %xmm5, %rax +; AVX2-NEXT: leal -1(%r13,%rax), %ebp +; AVX2-NEXT: vmovq %xmm5, %rax +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: leal -1(%r14,%rax), %r13d +; AVX2-NEXT: vpextrq $1, %xmm5, %rax +; AVX2-NEXT: leal -1(%r15,%rax), %r14d +; AVX2-NEXT: vmovq %xmm5, %rax +; AVX2-NEXT: leal -1(%r11,%rax), %r11d +; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: leal -1(%r10,%rax), %r10d +; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: leal -1(%r9,%rax), %r9d +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: leal -1(%r8,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: leal -1(%rdi,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: leaq -1(%rsi,%rax), %rax ; AVX2-NEXT: vmovq %xmm3, %rsi -; AVX2-NEXT: leaq -1(%rdx,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm0, %rdx -; AVX2-NEXT: vmovq %xmm2, %rsi -; AVX2-NEXT: leaq -1(%rdx,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %edx -; AVX2-NEXT: adcq $-1, %rdx -; AVX2-NEXT: addq $-1, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %esi -; AVX2-NEXT: adcq $-1, %rsi -; AVX2-NEXT: addq $-1, %r10 -; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r8d -; AVX2-NEXT: adcq $-1, %r8 -; AVX2-NEXT: addq $-1, %r15 -; AVX2-NEXT: movl $0, %r10d -; AVX2-NEXT: adcq $-1, %r10 -; AVX2-NEXT: addq $-1, %r12 -; AVX2-NEXT: movl $0, %ebx -; AVX2-NEXT: adcq $-1, %rbx -; AVX2-NEXT: addq $-1, %rbp -; AVX2-NEXT: movl $0, %r14d -; AVX2-NEXT: adcq $-1, %r14 -; AVX2-NEXT: addq $-1, %r9 -; AVX2-NEXT: movl $0, %r13d -; AVX2-NEXT: adcq $-1, %r13 -; AVX2-NEXT: addq $-1, %rdi -; AVX2-NEXT: movl $0, %r11d -; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %rcx -; AVX2-NEXT: movl $0, %eax -; AVX2-NEXT: adcq $-1, %rax -; AVX2-NEXT: shldq $63, %rcx, %rax -; AVX2-NEXT: shldq $63, %rdi, %r11 -; AVX2-NEXT: shldq $63, %r9, %r13 -; AVX2-NEXT: shldq $63, %rbp, %r14 -; AVX2-NEXT: shldq $63, %r12, %rbx -; AVX2-NEXT: shldq $63, %r15, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, 
%rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %rsi, %xmm2 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm3 -; AVX2-NEXT: vmovq %r8, %xmm4 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm5 -; AVX2-NEXT: vmovq %r10, %xmm6 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm7 -; AVX2-NEXT: vmovq %rbx, %xmm8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %r14, %xmm10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm11 -; AVX2-NEXT: vmovq %r13, %xmm12 -; AVX2-NEXT: vmovq %r11, %xmm13 -; AVX2-NEXT: vmovq %rax, %xmm14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: leaq -1(%rdx,%rsi), %r15 +; AVX2-NEXT: vpextrq $1, %xmm2, %rdx +; AVX2-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX2-NEXT: vmovq %xmm2, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rdi +; AVX2-NEXT: leaq -1(%rcx,%rdi), %rdi +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: leaq -1(%rcx,%r8), %rcx +; AVX2-NEXT: shrl %ebx +; AVX2-NEXT: vmovd %ebx, %xmm0 +; AVX2-NEXT: shrl %r12d +; AVX2-NEXT: vmovd %r12d, %xmm1 +; AVX2-NEXT: shrl %ebp +; AVX2-NEXT: vmovd %ebp, %xmm2 +; AVX2-NEXT: shrl %r13d +; AVX2-NEXT: vmovd %r13d, %xmm3 +; AVX2-NEXT: shrl %r14d +; AVX2-NEXT: vmovd %r14d, %xmm4 +; AVX2-NEXT: shrl %r11d +; AVX2-NEXT: vmovd %r11d, %xmm5 +; AVX2-NEXT: shrl %r10d +; AVX2-NEXT: vmovd %r10d, %xmm6 +; AVX2-NEXT: shrl %r9d +; AVX2-NEXT: vmovd %r9d, %xmm7 ; AVX2-NEXT: shrq %rax -; AVX2-NEXT: vmovq %rax, %xmm15 +; AVX2-NEXT: vmovq %rax, %xmm8 +; AVX2-NEXT: shrq %r15 +; AVX2-NEXT: vmovq %r15, %xmm9 +; AVX2-NEXT: shrq %rsi +; AVX2-NEXT: vmovq %rsi, %xmm10 +; AVX2-NEXT: shrq %rdx +; AVX2-NEXT: vmovq %rdx, %xmm11 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vmovd %eax, %xmm12 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vmovd %eax, %xmm13 +; AVX2-NEXT: shrq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm14 +; AVX2-NEXT: shrq %rcx +; AVX2-NEXT: vmovq %rcx, %xmm15 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -2197,18 +3014,17 @@ ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm1[0,1,2],xmm0[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: popq %rbx @@ -2228,138 +3044,119 @@ ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %r13 -; AVX512-NEXT: vpextrq $1, %xmm3, %rsi -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, 
%xmm3 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, %rdi -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512-NEXT: vmovq %xmm5, %r8 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vmovq %xmm2, %r9 -; AVX512-NEXT: vpextrq $1, %xmm2, %r10 +; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm2, %rcx ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vmovq %xmm2, %rdx +; AVX512-NEXT: vpextrq $1, %xmm2, %rsi +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512-NEXT: vpextrq $1, %xmm5, %rbx +; AVX512-NEXT: vmovq %xmm5, %r12 +; AVX512-NEXT: vpextrq $1, %xmm2, %r13 +; AVX512-NEXT: vmovq %xmm2, %r14 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpextrq $1, %xmm2, %r15 ; AVX512-NEXT: vmovq %xmm2, %r11 -; AVX512-NEXT: vpextrq $1, %xmm2, %rbx -; AVX512-NEXT: vpextrq $1, %xmm5, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpextrq $1, %xmm0, %r14 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512-NEXT: vpextrq $1, %xmm1, %r10 +; AVX512-NEXT: vmovq %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm3, %r8 +; AVX512-NEXT: vmovq %xmm3, %rdi +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512-NEXT: vpextrq $1, %xmm6, %rax +; AVX512-NEXT: leal -1(%rbx,%rax), %ebx +; AVX512-NEXT: vmovq %xmm6, %rax +; AVX512-NEXT: leal -1(%r12,%rax), %r12d +; AVX512-NEXT: vpextrq $1, %xmm5, %rax +; AVX512-NEXT: leal -1(%r13,%rax), %ebp +; AVX512-NEXT: vmovq %xmm5, %rax ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm8, %rbp -; AVX512-NEXT: addq %rdx, %rbp -; 
AVX512-NEXT: vpextrq $1, %xmm4, %rdx -; AVX512-NEXT: addq %rcx, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512-NEXT: addq %rax, %rcx -; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: addq %r14, %rax -; AVX512-NEXT: vpextrq $1, %xmm9, %r14 -; AVX512-NEXT: leaq -1(%rbx,%r14), %r12 -; AVX512-NEXT: vmovq %xmm9, %rbx -; AVX512-NEXT: leaq -1(%r11,%rbx), %r15 -; AVX512-NEXT: vpextrq $1, %xmm7, %r11 -; AVX512-NEXT: leaq -1(%r10,%r11), %r14 -; AVX512-NEXT: vmovq %xmm7, %r10 -; AVX512-NEXT: leaq -1(%r9,%r10), %rbx -; AVX512-NEXT: vmovq %xmm8, %r9 -; AVX512-NEXT: leaq -1(%r8,%r9), %r11 -; AVX512-NEXT: vmovq %xmm4, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 -; AVX512-NEXT: vpextrq $1, %xmm6, %rdi -; AVX512-NEXT: leaq -1(%rsi,%rdi), %r9 -; AVX512-NEXT: vmovq %xmm6, %rsi -; AVX512-NEXT: leaq -1(%r13,%rsi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm5, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm5, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: leaq -1(%rdi,%rsi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vmovq %xmm3, %rdi -; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm0, %rsi -; AVX512-NEXT: vmovq %xmm2, %rdi -; AVX512-NEXT: leaq -1(%rsi,%rdi), %rsi -; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: addq $-1, %rbp -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: adcq $-1, %rsi -; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %edi -; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movl $0, %r13d -; AVX512-NEXT: adcq $-1, %r13 -; AVX512-NEXT: addq $-1, %rax -; AVX512-NEXT: adcq $-1, %r8 -; AVX512-NEXT: shldq $63, %rax, %r8 -; AVX512-NEXT: shldq $63, %rcx, %r13 -; AVX512-NEXT: shldq $63, %rdx, %rdi -; AVX512-NEXT: shldq $63, %rbp, %rsi -; AVX512-NEXT: shrq %r12 -; AVX512-NEXT: vmovq %r12, %xmm0 -; AVX512-NEXT: shrq %r15 -; AVX512-NEXT: vmovq %r15, %xmm1 -; AVX512-NEXT: shrq %r14 -; AVX512-NEXT: vmovq %r14, %xmm2 -; AVX512-NEXT: shrq %rbx -; AVX512-NEXT: vmovq %rbx, %xmm3 -; AVX512-NEXT: vmovq %rsi, %xmm4 -; AVX512-NEXT: shrq %r11 -; AVX512-NEXT: vmovq %r11, %xmm5 -; AVX512-NEXT: vmovq %rdi, %xmm6 -; AVX512-NEXT: shrq %r10 -; AVX512-NEXT: vmovq %r10, %xmm7 -; AVX512-NEXT: shrq %r9 -; AVX512-NEXT: vmovq %r9, %xmm8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm11 -; AVX512-NEXT: vmovq %r13, %xmm12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm13 -; AVX512-NEXT: vmovq %r8, %xmm14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512-NEXT: leal -1(%r14,%rax), %r13d +; AVX512-NEXT: vpextrq $1, %xmm5, %rax +; AVX512-NEXT: leal -1(%r15,%rax), %r14d +; AVX512-NEXT: vmovq %xmm5, %rax +; AVX512-NEXT: leal 
-1(%r11,%rax), %r11d +; AVX512-NEXT: vpextrq $1, %xmm4, %rax +; AVX512-NEXT: leal -1(%r10,%rax), %r10d +; AVX512-NEXT: vmovq %xmm4, %rax +; AVX512-NEXT: leal -1(%r9,%rax), %r9d +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: leal -1(%r8,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovq %xmm1, %rax +; AVX512-NEXT: leal -1(%rdi,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm3, %rax +; AVX512-NEXT: leaq -1(%rsi,%rax), %rax +; AVX512-NEXT: vmovq %xmm3, %rsi +; AVX512-NEXT: leaq -1(%rdx,%rsi), %r15 +; AVX512-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX512-NEXT: vmovq %xmm2, %rcx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpextrq $1, %xmm1, %rdi +; AVX512-NEXT: leaq -1(%rcx,%rdi), %rdi +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %r8 +; AVX512-NEXT: leaq -1(%rcx,%r8), %rcx +; AVX512-NEXT: shrl %ebx +; AVX512-NEXT: vmovd %ebx, %xmm0 +; AVX512-NEXT: shrl %r12d +; AVX512-NEXT: vmovd %r12d, %xmm1 +; AVX512-NEXT: shrl %ebp +; AVX512-NEXT: vmovd %ebp, %xmm2 +; AVX512-NEXT: shrl %r13d +; AVX512-NEXT: vmovd %r13d, %xmm3 +; AVX512-NEXT: shrl %r14d +; AVX512-NEXT: vmovd %r14d, %xmm4 +; AVX512-NEXT: shrl %r11d +; AVX512-NEXT: vmovd %r11d, %xmm5 +; AVX512-NEXT: shrl %r10d +; AVX512-NEXT: vmovd %r10d, %xmm6 +; AVX512-NEXT: shrl %r9d +; AVX512-NEXT: vmovd %r9d, %xmm7 ; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm15 +; AVX512-NEXT: vmovq %rax, %xmm8 +; AVX512-NEXT: shrq %r15 +; AVX512-NEXT: vmovq %r15, %xmm9 +; AVX512-NEXT: shrq %rsi +; AVX512-NEXT: vmovq %rsi, %xmm10 +; AVX512-NEXT: shrq %rdx +; AVX512-NEXT: vmovq %rdx, %xmm11 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vmovd %eax, %xmm12 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vmovd %eax, %xmm13 +; AVX512-NEXT: shrq %rdi +; AVX512-NEXT: vmovq %rdi, %xmm14 +; AVX512-NEXT: shrq %rcx +; AVX512-NEXT: vmovq %rcx, %xmm15 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -2370,15 +3167,15 @@ ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2 +; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX512-NEXT: vpsllq $48, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512-NEXT: vmovdqu %xmm0, (%rax) diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll --- a/llvm/test/CodeGen/X86/avoid-sfb.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb.ll @@ -561,12 +561,12 @@ ; CHECK-NEXT: movl %ecx, 28(%rdi) ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_stack: @@ -579,8 +579,8 @@ ; DISABLED-NEXT: movups %xmm0, 16(%rdi) ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: retq ; ; AVX-LABEL: test_stack: diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -338,23 +338,25 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) { ; AVX1-LABEL: andn_disguised_i8_elts: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: andn_disguised_i8_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; INT256-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: 
vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %add = add <8 x i32> %y, %x @@ -417,17 +419,17 @@ define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) { ; AVX1-LABEL: andn_variable_mask_operand_concat: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm1 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: andn_variable_mask_operand_concat: diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -33,8 +33,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -601,8 +601,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -8,14 +8,14 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovaps (%ecx), %xmm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: ## %bb.0: -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll --- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll @@ -82,14 +82,19 @@ define <8 x float> @test7(float %a, <8 x float> %b, <8 x float> %c) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm2, %xmm3, %xmm2 +; X86-NEXT: vbroadcastss %xmm2, %ymm2 +; X86-NEXT: 
vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; X86-NEXT: retl ; ; X64-LABEL: test7: ; X64: # %bb.0: +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; X64-NEXT: retq %t0 = insertelement <8 x float> undef, float %a, i32 0 %t1 = fsub <8 x float> , %t0 @@ -102,14 +107,19 @@ define <8 x float> @test8(float %a, <8 x float> %b, <8 x float> %c) { ; X86-LABEL: test8: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm2, %xmm3, %xmm2 +; X86-NEXT: vbroadcastss %xmm2, %ymm2 +; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; X86-NEXT: retl ; ; X64-LABEL: test8: ; X64: # %bb.0: +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; X64-NEXT: retq %t0 = fsub float -0.0, %a %t1 = insertelement <8 x float> undef, float %t0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll @@ -29,15 +29,15 @@ ; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm3 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $220, %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4 -; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: add_v64i8_broadcasts: diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -1695,13 +1695,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB49_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovups 
%xmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB49_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1766,13 +1766,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB51_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1838,13 +1838,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v16f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB53_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB53_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1910,14 +1910,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] -; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB55_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB55_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1982,13 +1981,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v4f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB57_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB57_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2054,13 +2053,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v8f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB59_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: 
Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB59_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -15,9 +15,9 @@ ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -2999,9 +2999,8 @@ ; KNL-LABEL: zext_4xi1_to_4x32: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_4xi1_to_4x32: @@ -3014,8 +3013,8 @@ ; AVX512DQNOBW-LABEL: zext_4xi1_to_4x32: ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <4 x i8> %x, %y %1 = zext <4 x i1> %mask to <4 x i32> @@ -3026,8 +3025,8 @@ ; KNL-LABEL: zext_2xi1_to_2xi64: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: retq ; ; SKX-LABEL: zext_2xi1_to_2xi64: @@ -3040,8 +3039,8 @@ ; AVX512DQNOBW-LABEL: zext_2xi1_to_2xi64: ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <2 x i8> %x, %y %1 = zext <2 x i1> %mask to <2 x i64> diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- 
a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -732,11 +732,12 @@ define void @load_v3i1_broadcast_2_v1i1_store(ptr %a0,ptr %a1) { ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: testb $4, (%rdi) -; AVX512-NEXT: movl $255, %ecx -; AVX512-NEXT: cmovel %eax, %ecx -; AVX512-NEXT: kmovd %ecx, %k0 +; AVX512-NEXT: movzbl (%rdi), %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: btl $2, %eax +; AVX512-NEXT: movl $255, %eax +; AVX512-NEXT: cmovael %ecx, %eax +; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kshiftrb $2, %k0, %k0 ; AVX512-NEXT: kshiftlb $7, %k0, %k0 ; AVX512-NEXT: kshiftrb $7, %k0, %k0 @@ -745,11 +746,12 @@ ; ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: xorl %eax, %eax -; AVX512NOTDQ-NEXT: testb $4, (%rdi) -; AVX512NOTDQ-NEXT: movl $255, %ecx -; AVX512NOTDQ-NEXT: cmovel %eax, %ecx -; AVX512NOTDQ-NEXT: kmovd %ecx, %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: xorl %ecx, %ecx +; AVX512NOTDQ-NEXT: btl $2, %eax +; AVX512NOTDQ-NEXT: movl $255, %eax +; AVX512NOTDQ-NEXT: cmovael %ecx, %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -6,7 +6,7 @@ ; KNL-LABEL: hadd_16: ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax @@ -15,7 +15,7 @@ ; SKX-LABEL: hadd_16: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax @@ -33,7 +33,7 @@ ; KNL-LABEL: hsub_16: ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax @@ -42,7 +42,7 @@ ; SKX-LABEL: hsub_16: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -969,20 +969,22 @@ ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movl $4, %eax -; KNL-NEXT: subl %ecx, %eax +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v2i1: ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleuq %xmm1, 
%xmm0, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movl $4, %eax -; SKX-NEXT: subl %ecx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1086,10 +1088,11 @@ ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movl $4, %eax -; KNL-NEXT: subl %ecx, %eax +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -1097,10 +1100,11 @@ ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movl $4, %eax -; SKX-NEXT: subl %ecx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1132,9 +1132,9 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_d: ; X86: ## %bb.0: -; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -1204,9 +1204,9 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_d: ; X86: ## %bb.0: -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] ; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -5975,31 +5975,74 @@ declare <8 x i64> @llvm.x86.avx512.movntdqa(ptr) nounwind readonly define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_cmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8] -; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: 
[0x62,0xf1,0x7d,0x48,0x66,0xe9] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_cmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_cmp_d_512: +; 
X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6025,23 +6068,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X86-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; 
X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6050,23 +6100,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X64-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6092,31 +6152,74 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_ucmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06] -; 
CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_ucmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_ucmp_d_512: +; X64: ## %bb.0: +; 
X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6142,23 +6245,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: 
[0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6167,23 +6277,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: 
vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6797,9 +6917,10 @@ ; X86-LABEL: test_vptestmd: ; X86: ## %bb.0: ; X86-NEXT: vptestmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax ## encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; X86-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ 
-6827,9 +6948,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_d_512: ; X86: ## %bb.0: ; X86-NEXT: vptestnmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc1] -; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax ## encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6882,8 +7004,10 @@ define i16 @test_kand(i16 %a0, i16 %a1) { ; X86-LABEL: test_kand: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x41,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: andl $8, %eax ## encoding: [0x83,0xe0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6904,17 +7028,21 @@ define i16 @test_kandn(i16 %a0, i16 %a1) { ; X86-LABEL: test_kandn: ; X86: ## %bb.0: -; X86-NEXT: movl $65527, %eax ## encoding: [0xb8,0xf7,0xff,0x00,0x00] -; X86-NEXT: ## imm = 0xFFF7 -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ## encoding: [0x0b,0x44,0x24,0x04] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: movw $8, %ax ## encoding: [0x66,0xb8,0x08,0x00] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: kandnw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x42,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandnw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x42,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_kandn: ; X64: ## %bb.0: ; X64-NEXT: movl %edi, %eax ## encoding: [0x89,0xf8] -; X64-NEXT: orl $-9, %eax ## encoding: [0x83,0xc8,0xf7] +; X64-NEXT: orl $65527, %eax ## encoding: [0x0d,0xf7,0xff,0x00,0x00] +; X64-NEXT: ## imm = 0xFFF7 ; X64-NEXT: andl %esi, %eax ## encoding: [0x21,0xf0] ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ## encoding: [0xc3] @@ -6946,8 +7074,10 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: orw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x0b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: orl $8, %eax ## encoding: [0x83,0xc8,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax 
; X86-NEXT: retl ## encoding: [0xc3] @@ -6970,8 +7100,10 @@ define i16 @test_kxnor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kxnor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kxorw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x47,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: xorl $8, %eax ## encoding: [0x83,0xf0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6992,8 +7124,10 @@ define i16 @test_kxor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kxor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kxorw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x47,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: xorl $8, %eax ## encoding: [0x83,0xf0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -86,17 +86,13 @@ define void @mask16_mem(ptr %ptr) { ; CHECK-LABEL: mask16_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw (%rdi), %k0 -; CHECK-NEXT: knotw %k0, %k0 -; CHECK-NEXT: kmovw %k0, (%rdi) +; CHECK-NEXT: notw (%rdi) ; CHECK-NEXT: retq ; ; X86-LABEL: mask16_mem: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw (%eax), %k0 -; X86-NEXT: knotw %k0, %k0 -; X86-NEXT: kmovw %k0, (%eax) +; X86-NEXT: notw (%eax) ; X86-NEXT: retl %x = load i16, ptr %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> @@ -107,36 +103,15 @@ } define void @mask8_mem(ptr %ptr) { -; KNL-LABEL: mask8_mem: -; KNL: ## %bb.0: -; KNL-NEXT: notb (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: mask8_mem: -; SKX: ## %bb.0: -; SKX-NEXT: kmovb (%rdi), %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq -; -; AVX512BW-LABEL: mask8_mem: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: notb (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: mask8_mem: -; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovb (%rdi), %k0 -; AVX512DQ-NEXT: knotb %k0, %k0 -; AVX512DQ-NEXT: kmovb %k0, (%rdi) -; AVX512DQ-NEXT: retq +; CHECK-LABEL: mask8_mem: +; CHECK: ## %bb.0: +; CHECK-NEXT: notb (%rdi) +; CHECK-NEXT: retq ; ; X86-LABEL: mask8_mem: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovb (%eax), %k0 -; X86-NEXT: knotb %k0, %k0 -; X86-NEXT: kmovb %k0, (%eax) +; X86-NEXT: notb (%eax) ; X86-NEXT: retl %x = load i8, ptr %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> @@ -156,8 +131,11 @@ ; ; X86-LABEL: mand16: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: korw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %ma = bitcast i16 %x to <16 x i1> %mb = 
bitcast i16 %y to <16 x i1> @@ -1352,8 +1330,8 @@ ; ; X86-LABEL: test17: ; X86: ## %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al ; X86-NEXT: kshiftrq $6, %k0, %k1 @@ -3882,8 +3860,11 @@ ; ; X86-LABEL: test_v16i1_add: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3902,8 +3883,11 @@ ; ; X86-LABEL: test_v16i1_sub: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3922,8 +3906,11 @@ ; ; X86-LABEL: test_v16i1_mul: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kandw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3942,8 +3929,11 @@ ; ; X86-LABEL: test_v8i1_add: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -3962,8 +3952,11 @@ ; ; X86-LABEL: test_v8i1_sub: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -3982,8 +3975,11 @@ ; ; X86-LABEL: test_v8i1_mul: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -4751,11 +4747,7 @@ ; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 -; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; KNL-NEXT: vpor %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: je LBB77_1 ; KNL-NEXT: ## %bb.2: ## %exit @@ -4832,11 +4824,7 @@ ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; 
AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %k0 ; AVX512DQ-NEXT: kortestw %k0, %k0 ; AVX512DQ-NEXT: je LBB77_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1316,9 +1316,10 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,0,8,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1327,8 +1328,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,8,4,6,4,12] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -1342,10 +1344,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,8,4,6,4,12] +; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1718,9 +1721,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovd %xmm1, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vextractps $3, %xmm2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpextrd $2, %xmm1, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1729,11 +1738,17 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 
16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa (%rdi), %xmm3 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm4 +; CHECK-NEXT: vmovd %xmm3, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vextractps $3, %xmm4, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpextrd $2, %xmm3, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1745,11 +1760,17 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6] +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm1 +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 +; CHECK-NEXT: vmovd %xmm2, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vextractps $3, %xmm3, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpextrd $2, %xmm2, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -2695,40 +2716,24 @@ } define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { -; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1] -; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 -; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0 -; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { -; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1] -; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm2 -; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; 
CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vmovq 32(%rdi), %xmm3 # xmm3 = mem[0],zero +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -2737,22 +2742,13 @@ } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { -; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1] -; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} -; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 -; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 8(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovq 32(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -3167,11 +3163,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3182,10 +3179,11 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermt2ps %ymm2, %ymm3, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3482,26 +3480,16 @@ } 
define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { -; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 -; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3510,26 +3498,16 @@ } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { -; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1 -; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} -; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -4672,10 +4650,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup 
8(%rdi), %xmm2 # xmm2 = mem[0,0] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] +; CHECK-NEXT: vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm3 # xmm3 = mem[0],zero +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm3[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4687,10 +4666,11 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0] -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 -; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] +; CHECK-NEXT: vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm2[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -629,7 +629,8 @@ ; ; SKX-LABEL: usat_trunc_wb_128_mem: ; SKX: ## %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpmovuswb %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %x3 = icmp ult <8 x i16> %i, %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> @@ -654,7 +655,8 @@ define void @usat_trunc_qb_512_mem(<8 x i64> %i, ptr %res) { ; ALL-LABEL: usat_trunc_qb_512_mem: ; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vpmovusqb %zmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <8 x i64> %i, @@ -864,18 +866,11 @@ } define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, ptr %res) { -; KNL-LABEL: smax_usat_trunc_wb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) -; SKX-NEXT: retq +; ALL-LABEL: smax_usat_trunc_wb_128_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) +; ALL-NEXT: retq %x1 = icmp sgt <8 x i16> %i, %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> %x3 = icmp slt <8 x i16> %x2, @@ -907,7 +902,8 @@ ; ALL: ## %bb.0: ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vpmovusqb %zmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x1 = icmp sgt <8 x i64> %i, diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -311,10 +311,6 @@ ; We implement the scalar broadcast intrinsics with vector initializers. ; Verify that the IR generated will produce the broadcast at the end. 
define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { -; ALL-LABEL: test_mm512_broadcastsd_pd: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 -; ALL-NEXT: retq entry: %0 = extractelement <2 x double> %a, i32 0 %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1187,14 +1187,22 @@ } define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 { -; AVX512-LABEL: test45: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] -; AVX512-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] -; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: retq ## encoding: [0xc3] +; KNL-LABEL: test45: +; KNL: ## %bb.0: +; KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; KNL-NEXT: vpsrlw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xd0,0x0f] +; KNL-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; KNL-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; KNL-NEXT: retq ## encoding: [0xc3] +; +; AVX512BW-LABEL: test45: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; AVX512BW-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; AVX512BW-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] +; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512BW-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: test45: ; SKX: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -49,8 +49,8 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x5c,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpblendmb %zmm3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0xcb] ; X86-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd3] ; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] @@ -108,8 +108,8 @@ ; X86-LABEL: test_int_x86_avx512_mask_storeu_b_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1} # encoding: 
[0x62,0xf1,0x7f,0x49,0x7f,0x01] ; X86-NEXT: vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -187,9 +187,9 @@ ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpblendmb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x08] ; X86-NEXT: vmovdqu8 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x11] ; X86-NEXT: retl # encoding: [0xc3] @@ -455,12 +455,11 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { ; X86-LABEL: test_mask_pcmpeq_b: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -491,9 +490,9 @@ define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { ; X86-LABEL: test_mask_pcmpeq_w: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -533,12 +532,11 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { ; X86-LABEL: test_mask_pcmpgt_b: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -569,9 +567,9 @@ define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { ; X86-LABEL: test_mask_pcmpgt_w: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtw %zmm1, %zmm0, 
%k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1941,47 +1939,66 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_cmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp # encoding: [0x55] +; X86-NEXT: pushl %ebx # encoding: [0x53] ; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] -; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] -; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] +; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xd0] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, 
%eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x02] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] ; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] -; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] -; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] -; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] +; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x05] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: popl %edi # encoding: [0x5f] +; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2116,47 +2133,66 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_x86_avx512_ucmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp # 
encoding: [0x55] +; X86-NEXT: pushl %ebx # encoding: [0x53] ; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] -; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] -; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] +; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x01] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x02] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] ; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] -; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # 
encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8] -; X86-NEXT: addl %esi, %edi # encoding: [0x01,0xf7] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] -; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] -; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] +; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x05] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x06] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x0c] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x10] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: popl %edi # encoding: [0x5f] +; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2593,13 +2629,14 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vptestmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandq %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x41,0xc9] +; X86-NEXT: kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] -; X86-NEXT: andl %ecx, %edx # encoding: [0x21,0xca] -; X86-NEXT: 
kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: andl %esi, %eax # encoding: [0x21,0xf0] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2626,9 +2663,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_w_512: ; X86: # %bb.0: ; X86-NEXT: vptestmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2654,13 +2692,14 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vptestnmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x26,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandq %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x41,0xc9] +; X86-NEXT: kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] -; X86-NEXT: andl %ecx, %edx # encoding: [0x21,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: andl %esi, %eax # encoding: [0x21,0xf0] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2687,9 +2726,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_w_512: ; X86: # %bb.0: ; X86-NEXT: vptestnmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x48,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll @@ -38,9 +38,7 @@ define void @mask32_mem(ptr %ptr) { ; 
CHECK-LABEL: mask32_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd (%rdi), %k0 -; CHECK-NEXT: knotd %k0, %k0 -; CHECK-NEXT: kmovd %k0, (%rdi) +; CHECK-NEXT: notl (%rdi) ; CHECK-NEXT: retq %x = load i32, ptr %ptr, align 4 %m0 = bitcast i32 %x to <32 x i1> @@ -56,9 +54,7 @@ define void @mask64_mem(ptr %ptr) { ; CHECK-LABEL: mask64_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovq (%rdi), %k0 -; CHECK-NEXT: knotq %k0, %k0 -; CHECK-NEXT: kmovq %k0, (%rdi) +; CHECK-NEXT: notq (%rdi) ; CHECK-NEXT: retq %x = load i64, ptr %ptr, align 4 %m0 = bitcast i64 %x to <64 x i1> diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -927,9 +927,9 @@ define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { ; X86-LABEL: test_mask_pcmpeq_b_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -961,9 +961,9 @@ define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_w_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -996,9 +996,9 @@ define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { ; X86-LABEL: test_mask_pcmpgt_b_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1030,9 +1030,9 @@ define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_w_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -1065,9 +1065,9 @@ define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> 
%b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_b_128: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1131,9 +1131,9 @@ define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_b_128: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4846,7 +4846,7 @@ ; X64-LABEL: test_cmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] @@ -4946,7 +4946,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] @@ -5040,7 +5040,7 @@ ; X64-LABEL: test_ucmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] @@ -5140,7 +5140,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] @@ -5186,31 +5186,68 @@ declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> 
%a1) { -; CHECK-LABEL: test_cmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8] -; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: 
[0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5236,23 +5273,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpgtw 
%ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X86-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # 
encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5261,23 +5304,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X64-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5303,31 +5351,68 @@ declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_ucmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, 
%xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, 
<16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5353,23 +5438,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, 
%ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5378,23 +5469,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] 
+; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5420,30 +5516,66 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_cmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8] -; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression 
encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd 
%xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5469,23 +5601,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw 
$4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5493,23 +5631,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: 
[0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -5534,30 +5677,66 @@ declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_ucmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5583,23 +5762,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: 
[0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5607,23 +5792,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # 
EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -6119,9 +6309,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_b_128: ; X86: # %bb.0: ; X86-NEXT: vptestmb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] @@ -6146,9 +6337,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_b_256: ; X86: # %bb.0: ; X86-NEXT: vptestmb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -6199,9 +6391,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_w_256: ; X86: # %bb.0: ; X86-NEXT: vptestmw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: 
[0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -6228,9 +6421,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_b_128: ; X86: # %bb.0: ; X86-NEXT: vptestnmb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] @@ -6255,9 +6449,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_b_256: ; X86: # %bb.0: ; X86-NEXT: vptestnmb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -6308,9 +6503,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_w_256: ; X86: # %bb.0: ; X86-NEXT: vptestnmw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll @@ -7,9 +7,9 @@ define zeroext i8 @test_mm512_mask_fpclass_pd_mask(i8 zeroext %__U, <8 x double> %__A) { ; X86-LABEL: test_mm512_mask_fpclass_pd_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclasspd $4, %zmm0, %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclasspd $4, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -49,9 +49,9 @@ define zeroext i16 @test_mm512_mask_fpclass_ps_mask(i16 zeroext %__U, <16 x float> %__A) { ; X86-LABEL: test_mm512_mask_fpclass_ps_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclassps $4, %zmm0, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclassps 
$4, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll @@ -17,9 +17,7 @@ define void @mask8_mem(ptr %ptr) { ; CHECK-LABEL: mask8_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovb (%rdi), %k0 -; CHECK-NEXT: knotb %k0, %k0 -; CHECK-NEXT: kmovb %k0, (%rdi) +; CHECK-NEXT: notb (%rdi) ; CHECK-NEXT: retq %x = load i8, ptr %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll @@ -336,9 +336,9 @@ define zeroext i8 @test_mm256_mask_fpclass_ps_mask(i8 zeroext %__U, <8 x float> %__A) { ; X86-LABEL: test_mm256_mask_fpclass_ps_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclassps $2, %ymm0, %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll --- a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll +++ b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -92,9 +92,9 @@ ; ; X86-LABEL: TEST_mm512_mask_test_epi32_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -198,9 +198,9 @@ ; ; X86-LABEL: TEST_mm512_mask_testn_epi32_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll --- a/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-shuffle-fma.ll @@ -7,7 +7,7 @@ define <2 x half> @foo(<2 x half> %0) "unsafe-fp-math"="true" nounwind { ; AVX2-LABEL: foo: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $40, %rsp +; AVX2-NEXT: subq $56, %rsp ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX2-NEXT: callq __extendhfsf2@PLT @@ -20,9 +20,17 @@ ; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload ; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero ; AVX2-NEXT: callq __extendhfsf2@PLT +; AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX2-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload ; AVX2-NEXT: callq __truncsfhf2@PLT -; AVX2-NEXT: addq $40, %rsp +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vaddss 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; AVX2-NEXT: callq __truncsfhf2@PLT +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVX2-NEXT: addq $56, %rsp ; AVX2-NEXT: retq ; ; F16C-LABEL: foo: @@ -44,7 +52,10 @@ ; FP16-LABEL: foo: ; FP16: # %bb.0: ; FP16-NEXT: vpsrld $16, %xmm0, %xmm1 -; FP16-NEXT: vfmaddsub231ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; FP16-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; FP16-NEXT: vaddph %xmm0, %xmm1, %xmm2 +; FP16-NEXT: vsubsh %xmm0, %xmm1, %xmm0 +; FP16-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; FP16-NEXT: retq %2 = shufflevector <2 x half> %0, <2 x half> undef, <2 x i32> %3 = fmul fast <2 x half> %2, diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll @@ -528,12 +528,12 @@ define void @fmadd_sh_mask_memfold(ptr %a, ptr %b, i8 %c) { ; X86-LABEL: fmadd_sh_mask_memfold: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovsh (%ecx), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01] ; X86-NEXT: vmovsh (%eax), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08] ; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1] ; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1995,25 +1995,27 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind { ; X64-LABEL: test21: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq ; ; X86-LABEL: test21: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpbroadcastw %xmm1, %xmm1 -; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 
+; X86-NEXT: vpbroadcastw %xmm2, %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: vpbroadcastw %xmm2, %xmm1 +; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-NEXT: retl %1 = insertelement <8 x half> , half %a, i32 0 %2 = insertelement <8 x half> %1, half %b, i32 1 @@ -2099,7 +2101,9 @@ ; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] +; X64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X64-NEXT: retq @@ -2115,7 +2119,9 @@ ; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] +; X86-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X86-NEXT: movl %ebp, %esp @@ -2130,8 +2136,9 @@ define <8 x i16> @pr59628_xmm(i16 %arg) { ; X64-LABEL: pr59628_xmm: ; X64: # %bb.0: -; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vpbroadcastw %edi, %xmm1 +; X64-NEXT: vmovsh %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll @@ -283,8 +283,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mask) { ; X86-LABEL: test_mask_compress_store_b_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll @@ -282,8 +282,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mask) { ; X86-LABEL: test_mask_compress_store_b_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: 
[0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -19496,8 +19496,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19682,8 +19682,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19859,8 +19859,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -20033,8 +20033,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -21162,8 +21162,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl $3, %eax @@ -21343,8 +21343,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21529,8 +21529,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21706,8 +21706,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21880,8 +21880,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: 
vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -22068,8 +22068,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22260,8 +22260,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22443,8 +22443,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22623,8 +22623,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/bit-test-shift.ll b/llvm/test/CodeGen/X86/bit-test-shift.ll --- a/llvm/test/CodeGen/X86/bit-test-shift.ll +++ b/llvm/test/CodeGen/X86/bit-test-shift.ll @@ -5,10 +5,12 @@ define i32 @x(i32 %t) nounwind readnone ssp { ; CHECK-LABEL: x: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: shll $23, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: andl $-26, %eax +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp) +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movl $-26, %eax +; CHECK-NEXT: .LBB0_2: # %entry ; CHECK-NEXT: retl entry: %and = and i32 %t, 256 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -433,16 +433,26 @@ } define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) { -; SSE2-SSSE3-LABEL: v4i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax -; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSSE3-NEXT: 
pcmpgtb %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3] +; SSSE3-NEXT: movmskps %xmm2, %eax +; SSSE3-NEXT: # kill: def $al killed $al killed $eax +; SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: ; AVX12: # %bb.0: diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -340,14 +340,22 @@ } define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) { -; SSE2-SSSE3-LABEL: v4i8: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax -; SSE2-SSSE3-NEXT: retq +; SSE2-LABEL: v4i8: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v4i8: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pcmpgtb %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,2,u,u,u,3] +; SSSE3-NEXT: movmskps %xmm0, %eax +; SSSE3-NEXT: # kill: def $al killed $al killed $eax +; SSSE3-NEXT: retq ; ; AVX12-LABEL: v4i8: ; AVX12: # %bb.0: @@ -566,18 +574,14 @@ ; ; AVX512F-LABEL: bitcast_16i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, (%rdi) -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vpmovmskb %xmm0, %eax +; AVX512F-NEXT: movw %ax, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_16i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovb2m %xmm0, %k0 -; AVX512BW-NEXT: kmovw %k0, (%rdi) +; AVX512BW-NEXT: vpmovmskb %xmm0, %eax +; AVX512BW-NEXT: movw %ax, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <16 x i8> %a0, zeroinitializer %a2 = bitcast <16 x i1> %a1 to i16 @@ -638,17 +642,13 @@ ; ; AVX512F-LABEL: bitcast_4i32_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vmovmskps %xmm0, %eax ; AVX512F-NEXT: movb %al, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_4i32_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vmovmskps %xmm0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <4 x i32> %a0, zeroinitializer @@ -672,17 +672,13 @@ ; ; AVX512F-LABEL: bitcast_2i64_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vmovmskpd %xmm0, %eax ; AVX512F-NEXT: movb %al, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_2i64_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vmovmskpd %xmm0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <2 x i64> %a0, zeroinitializer diff --git 
a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll @@ -329,26 +329,12 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: bitcast_32i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: kmovw %k1, 2(%rdi) -; AVX512F-NEXT: kmovw %k0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: bitcast_32i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovb2m %ymm0, %k0 -; AVX512BW-NEXT: kmovd %k0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: bitcast_32i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: movl %eax, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = icmp slt <32 x i8> %a0, zeroinitializer %a2 = bitcast <32 x i1> %a1 to i32 store i32 %a2, ptr %p @@ -446,23 +432,12 @@ ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq ; -; AVX512F-LABEL: bitcast_4i64_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: bitcast_4i64_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: bitcast_4i64_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = icmp slt <4 x i64> %a0, zeroinitializer %a2 = bitcast <4 x i1> %a1 to i4 store i4 %a2, ptr %p diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -450,24 +450,12 @@ ; ; AVX512F-LABEL: bitcast_64i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-NEXT: kmovw %k3, 6(%rdi) -; AVX512F-NEXT: kmovw %k2, 4(%rdi) -; AVX512F-NEXT: kmovw %k1, 2(%rdi) -; AVX512F-NEXT: kmovw %k0, (%rdi) +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpmovmskb %ymm1, %eax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: vpmovmskb %ymm0, %ecx +; AVX512F-NEXT: orq %rax, %rcx +; AVX512F-NEXT: movq %rcx, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -615,13 +603,10 @@ ; ; AVX1-LABEL: bitcast_8i64_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movb %al, (%rdi) diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -46,30 +46,27 @@ } define i1 @trunc_v2i64_cmp(<2 x i64> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v2i64_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $63, %xmm0 -; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: testl %eax, %eax -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v2i64_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v2i64_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX12-NEXT: vtestpd %xmm0, %xmm0 ; AVX12-NEXT: sete %al ; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v2i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %1 = trunc <2 x i64> %a0 to <2 x i1> @@ -79,15 +76,30 @@ } define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v4i32_to_v2i2: -; SSE: # %bb.0: -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $2, %cl -; SSE-NEXT: andb $3, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v4i32_to_v2i2: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $2, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $3, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v4i32_to_v2i2: +; SSE41: # %bb.0: +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $3, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v4i32_to_v2i2: ; AVX: # %bb.0: @@ -107,31 +119,29 @@ } define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v4i32_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v4i32_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; 
SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_cmp: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v4i32_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX12-NEXT: vtestps %xmm1, %xmm0 ; AVX12-NEXT: setb %al ; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v4i32_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb $15, %al +; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %1 = trunc <4 x i32> %a0 to <4 x i1> %2 = bitcast <4 x i1> %1 to i4 @@ -140,16 +150,32 @@ } define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind { -; SSE-LABEL: bitcast_v8i16_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i16_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i16_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX12-LABEL: bitcast_v8i16_to_v2i4: ; AVX12: # %bb.0: @@ -181,23 +207,19 @@ } define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v8i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v8i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v8i16_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX12-NEXT: setne %al ; AVX12-NEXT: retq ; @@ -223,24 +245,14 @@ ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; -; AVX12-LABEL: bitcast_v16i8_to_v2i8: -; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax -; AVX12-NEXT: shrl $8, %eax -; AVX12-NEXT: addb %cl, %al -; AVX12-NEXT: # kill: def $al killed $al 
killed $eax -; AVX12-NEXT: retq -; -; AVX512-LABEL: bitcast_v16i8_to_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovb2m %xmm0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq +; AVX-LABEL: bitcast_v16i8_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovmskb %xmm0, %ecx +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq %1 = icmp slt <16 x i8> %a0, zeroinitializer %2 = bitcast <16 x i1> %1 to <2 x i8> %3 = extractelement <2 x i8> %2, i32 0 @@ -250,32 +262,21 @@ } define i1 @trunc_v16i8_cmp(<16 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v16i8_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v16i8_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al -; SSE41-NEXT: retq -; -; AVX12-LABEL: trunc_v16i8_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: setae %al -; AVX12-NEXT: retq +; SSE-LABEL: trunc_v16i8_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; AVX512-LABEL: trunc_v16i8_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v16i8_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: setne %al +; AVX-NEXT: retq %1 = trunc <16 x i8> %a0 to <16 x i1> %2 = bitcast <16 x i1> %1 to i16 %3 = icmp ne i16 %2, -1 @@ -287,16 +288,32 @@ ; define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v4i64_to_v2i2: -; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $2, %cl -; SSE-NEXT: andb $3, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v4i64_to_v2i2: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $2, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $3, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v4i64_to_v2i2: +; SSE41: # %bb.0: +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $3, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v4i64_to_v2i2: ; AVX: # %bb.0: @@ -317,41 +334,39 @@ } define i1 @trunc_v4i64_cmp(<4 x i64> %a0) nounwind { 
-; SSE2-SSSE3-LABEL: trunc_v4i64_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: testl %eax, %eax -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v4i64_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_cmp: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v4i64_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v4i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -362,17 +377,34 @@ } define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v8i32_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i32_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i32_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v8i32_to_v2i4: ; AVX: # %bb.0: @@ -393,33 +425,35 @@ } define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v8i132_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; ; SSE41-LABEL: 
trunc_v8i132_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i132_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setae %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i132_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setae %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -491,33 +525,38 @@ } define i1 @trunc_v16i16_cmp(<16 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v16i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v16i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v16i16_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -566,10 +605,9 @@ ; ; AVX512-LABEL: bitcast_v32i8_to_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovb2m %ymm0, %k0 -; AVX512-NEXT: kshiftrd $16, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %ecx +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: shrl $16, %eax ; AVX512-NEXT: addl 
%ecx, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -583,42 +621,41 @@ } define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v32i8_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v32i8_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_cmp: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i8_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = trunc <32 x i8> %a0 to <32 x i1> @@ -632,29 +669,45 @@ ; define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v8i64_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i64_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb 
%xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX1-LABEL: bitcast_v8i64_to_v2i4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movl %eax, %ecx @@ -717,26 +770,43 @@ ; ; SSE41-LABEL: trunc_v8i64_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -823,37 +893,65 @@ define i1 @trunc_v16i32_cmp(<16 x i32> %a0) nounwind { ; SSE2-SSSE3-LABEL: trunc_v16i32_cmp: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: psllw $7, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax ; 
SSE2-SSSE3-NEXT: testl %eax, %eax ; SSE2-SSSE3-NEXT: sete %al ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -926,41 +1024,50 @@ } define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v32i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: notl %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: psllw $7, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, 
%ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -991,7 +1098,6 @@ ; SSE2-SSSE3-NEXT: shll $16, %edx ; SSE2-SSSE3-NEXT: orl %eax, %edx ; SSE2-SSSE3-NEXT: shlq $32, %rdx -; SSE2-SSSE3-NEXT: orq %rcx, %rdx ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-SSSE3-NEXT: movd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -35,21 +35,21 @@ define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind { ; X86-LABEL: bitselect_i16: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorw %ax, %cx +; X86-NEXT: andw %cx, %ax +; X86-NEXT: notl %ecx ; X86-NEXT: andw {{[0-9]+}}(%esp), %cx -; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-NOBMI-LABEL: bitselect_i16: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl %edx, %eax -; X64-NOBMI-NEXT: andl %edx, %esi -; X64-NOBMI-NEXT: notl %eax -; X64-NOBMI-NEXT: andl %edi, %eax -; X64-NOBMI-NEXT: orl %esi, %eax +; X64-NOBMI-NEXT: movl %esi, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: andl %edx, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bool-ext-inc.ll b/llvm/test/CodeGen/X86/bool-ext-inc.ll --- a/llvm/test/CodeGen/X86/bool-ext-inc.ll +++ b/llvm/test/CodeGen/X86/bool-ext-inc.ll @@ -6,8 +6,8 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { ; CHECK-LABEL: sext_inc: ; CHECK: # %bb.0: -; CHECK-NEXT: xorb $1, %dil -; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl $1, %eax ; CHECK-NEXT: retq %ext = sext i1 %x to i32 %add = add i32 %ext, 1 @@ -19,8 +19,10 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 
+; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, @@ -31,8 +33,8 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -44,7 +46,8 @@ ; CHECK-LABEL: cmpne_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %cmp = icmp ne <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -56,8 +59,8 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y %ext = sext <4 x i1> %cmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll --- a/llvm/test/CodeGen/X86/bool-math.ll +++ b/llvm/test/CodeGen/X86/bool-math.ll @@ -12,8 +12,9 @@ ; ; X32-LABEL: sub_zext_cmp_mask_same_size_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andb $1, %al +; X32-NEXT: movzbl %al, %eax ; X32-NEXT: orl $-28, %eax ; X32-NEXT: retl %a = and i32 %x, 1 @@ -141,7 +142,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_same_size_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $42, %eax ; X32-NEXT: retl @@ -161,7 +162,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_wider_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $26, %eax ; X32-NEXT: xorl %edx, %edx @@ -183,7 +184,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $36, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -126,14 +126,21 @@ ; CHECK-LABEL: test2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: shll $8, %eax +; CHECK-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: sarl $16, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test2: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: shll $8, %edi +; CHECK64-NEXT: andl $16711680, %edi # imm = 0xFF0000 +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: sarl $16, %eax ; CHECK64-NEXT: retq %and = lshr i32 %a, 8 diff --git a/llvm/test/CodeGen/X86/bswap_tree2.ll b/llvm/test/CodeGen/X86/bswap_tree2.ll --- 
a/llvm/test/CodeGen/X86/bswap_tree2.ll +++ b/llvm/test/CodeGen/X86/bswap_tree2.ll @@ -11,20 +11,28 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andl $16711935, %ecx # imm = 0xFF00FF +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 ; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # %bb.0: +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: andl $16711935, %eax # imm = 0xFF00FF -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl $-16777216, %edi # imm = 0xFF000000 -; CHECK64-NEXT: shrl $8, %edi +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi ; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll --- a/llvm/test/CodeGen/X86/bt.ll +++ b/llvm/test/CodeGen/X86/bt.ll @@ -1064,7 +1064,7 @@ ; X86-LABEL: extend: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax ; X86-NEXT: setb %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll --- a/llvm/test/CodeGen/X86/btc_bts_btr.ll +++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll @@ -859,8 +859,8 @@ ; X86-NEXT: .LBB33_2: ; X86-NEXT: notl %esi ; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, (%eax) ; X86-NEXT: andl %esi, 4(%eax) +; X86-NEXT: andl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -899,8 +899,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB34_2: -; X86-NEXT: orl %edx, (%eax) ; X86-NEXT: orl %esi, 4(%eax) +; X86-NEXT: orl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -938,8 +938,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB35_2: -; X86-NEXT: xorl %edx, (%eax) ; X86-NEXT: xorl %esi, 4(%eax) +; X86-NEXT: xorl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -1027,8 +1027,8 @@ ; ; X86-LABEL: btr_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx @@ -1062,8 +1062,8 @@ ; ; X86-LABEL: bts_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx @@ -1094,8 +1094,8 @@ ; ; X86-LABEL: btc_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: 
shldl %cl, %eax, %edx diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -10,9 +10,8 @@ ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; SSE2-NEXT: movl %eax, (%rdi) +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: foo: @@ -47,12 +46,12 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-LABEL: test_negative_zero_1: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_negative_zero_1: @@ -77,19 +76,14 @@ ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'. define <2 x double> @test_negative_zero_2(<2 x double> %A) { -; SSE2-LABEL: test_negative_zero_2: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_negative_zero_2: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: test_negative_zero_2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: retq ; ; AVX-LABEL: test_negative_zero_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq entry: %0 = extractelement <2 x double> %A, i32 0 @@ -785,9 +779,10 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; SSE2-LABEL: PR46586: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl 3(%rdi), %eax -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pinsrw $6, %eax, %xmm1 +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movzbl 3(%rdi), %ecx +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -799,9 +794,10 @@ ; ; SSE41-LABEL: PR46586: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: extractps $3, %xmm0, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %eax ; SSE41-NEXT: xorl %edx, %edx ; SSE41-NEXT: divl %ecx ; SSE41-NEXT: movl %edx, %eax @@ -809,9 +805,10 @@ ; ; AVX-LABEL: PR46586: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm1, %eax ; AVX-NEXT: vextractps $3, %xmm0, %ecx -; AVX-NEXT: vpextrb $3, %xmm1, %eax ; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: divl %ecx ; AVX-NEXT: movl %edx, %eax diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll --- 
a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll @@ -174,13 +174,10 @@ ; CHECK-NEXT: imull %edx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: sarl $3, %edx -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shll $5, %edx +; CHECK-NEXT: shrl $3, %edx ; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: shll $5, %edx ; CHECK-NEXT: subl %edx, %ecx -; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retl %resultdiv = sdiv i32 %a, 33 diff --git a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll --- a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll +++ b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll @@ -14,12 +14,12 @@ ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movups (%rdx), %xmm0 ; X64-NEXT: movups 16(%rdx), %xmm1 -; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; X64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: callq *__guard_dispatch_icall_fptr(%rip) ; X64-NEXT: nop ; X64-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -831,7 +831,6 @@ ; X86-NOCMOV-LABEL: cttz_i64_zero_test: ; X86-NOCMOV: # %bb.0: ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOCMOV-NOT: rep ; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %edx ; X86-NOCMOV-NEXT: movl $32, %eax ; X86-NOCMOV-NEXT: je .LBB15_2 @@ -852,12 +851,10 @@ ; X86-CMOV-LABEL: cttz_i64_zero_test: ; X86-CMOV: # %bb.0: ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NOT: rep ; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx ; X86-CMOV-NEXT: movl $32, %edx ; X86-CMOV-NEXT: cmovnel %ecx, %edx ; X86-CMOV-NEXT: addl $32, %edx -; X86-CMOV-NOT: rep ; X86-CMOV-NEXT: bsfl %eax, %eax ; X86-CMOV-NEXT: cmovel %edx, %eax ; X86-CMOV-NEXT: xorl %edx, %edx @@ -1395,15 +1392,13 @@ ; ; X86-CLZ-LABEL: PR47603_trunc: ; X86-CLZ: # %bb.0: -; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: xorb $31, %al +; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: PR47603_trunc: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: lzcntl %edi, %eax -; X64-CLZ-NEXT: xorb $31, %al +; X64-CLZ-NEXT: bsrl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq ; @@ -1481,13 +1476,11 @@ define i32 @cttz_i32_osize(i32 %x) optsize { ; X86-LABEL: cttz_i32_osize: ; X86: # %bb.0: -; X86-NOT: rep ; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32_osize: ; X64: # %bb.0: -; X64-NOT: rep ; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: retq ; @@ -1517,13 +1510,11 @@ define i32 @cttz_i32_msize(i32 %x) minsize { ; X86-LABEL: cttz_i32_msize: ; X86: # %bb.0: -; X86-NOT: rep ; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32_msize: ; X64: # %bb.0: -; X64-NOT: rep ; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cmov-promotion.ll b/llvm/test/CodeGen/X86/cmov-promotion.ll --- a/llvm/test/CodeGen/X86/cmov-promotion.ll +++ b/llvm/test/CodeGen/X86/cmov-promotion.ll @@ -30,20 +30,19 @@ define i32 
@cmov_zpromotion_8_to_32(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_8_to_32: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $126, %ecx -; CMOV-NEXT: movl $255, %eax -; CMOV-NEXT: cmovnel %ecx, %eax +; CMOV-NEXT: andb $1, %dil +; CMOV-NEXT: decb %dil +; CMOV-NEXT: orb $126, %dil +; CMOV-NEXT: movzbl %dil, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_8_to_32: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $126, %eax -; NO_CMOV-NEXT: jne .LBB1_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $255, %eax -; NO_CMOV-NEXT: .LBB1_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andb $1, %al +; NO_CMOV-NEXT: decb %al +; NO_CMOV-NEXT: orb $126, %al +; NO_CMOV-NEXT: movzbl %al, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i8 12414, i8 -1 %ret = zext i8 %t0 to i32 @@ -53,20 +52,19 @@ define i64 @cmov_zpromotion_8_to_64(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_8_to_64: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $126, %ecx -; CMOV-NEXT: movl $255, %eax -; CMOV-NEXT: cmovneq %rcx, %rax +; CMOV-NEXT: andb $1, %dil +; CMOV-NEXT: decb %dil +; CMOV-NEXT: orb $126, %dil +; CMOV-NEXT: movzbl %dil, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_8_to_64: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $126, %eax -; NO_CMOV-NEXT: jne .LBB2_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $255, %eax -; NO_CMOV-NEXT: .LBB2_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andb $1, %al +; NO_CMOV-NEXT: decb %al +; NO_CMOV-NEXT: orb $126, %al +; NO_CMOV-NEXT: movzbl %al, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i8 12414, i8 -1 @@ -77,20 +75,19 @@ define i32 @cmov_zpromotion_16_to_32(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_16_to_32: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E -; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; CMOV-NEXT: cmovnel %ecx, %eax +; CMOV-NEXT: andl $1, %edi +; CMOV-NEXT: decl %edi +; CMOV-NEXT: orl $12414, %edi # imm = 0x307E +; CMOV-NEXT: movzwl %di, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_32: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E -; NO_CMOV-NEXT: jne .LBB3_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; NO_CMOV-NEXT: .LBB3_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andl $1, %eax +; NO_CMOV-NEXT: decl %eax +; NO_CMOV-NEXT: orl $12414, %eax # imm = 0x307E +; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 %ret = zext i16 %t0 to i32 @@ -100,20 +97,19 @@ define i64 @cmov_zpromotion_16_to_64(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_16_to_64: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E -; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; CMOV-NEXT: cmovneq %rcx, %rax +; CMOV-NEXT: andl $1, %edi +; CMOV-NEXT: decl %edi +; CMOV-NEXT: orl $12414, %edi # imm = 0x307E +; CMOV-NEXT: movzwl %di, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_64: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E -; NO_CMOV-NEXT: jne .LBB4_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; NO_CMOV-NEXT: .LBB4_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andl $1, %eax +; NO_CMOV-NEXT: decl %eax +; 
NO_CMOV-NEXT: orl $12414, %eax # imm = 0x307E +; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll --- a/llvm/test/CodeGen/X86/cmov.ll +++ b/llvm/test/CodeGen/X86/cmov.ll @@ -216,7 +216,7 @@ ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: notl %edi ; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovnsl %edi, %eax +; CHECK-NEXT: cmovgl %edi, %eax ; CHECK-NEXT: retq %not_x = xor i32 %x, -1 %1 = icmp slt i32 %not_x, -1 diff --git a/llvm/test/CodeGen/X86/cmp-bool.ll b/llvm/test/CodeGen/X86/cmp-bool.ll --- a/llvm/test/CodeGen/X86/cmp-bool.ll +++ b/llvm/test/CodeGen/X86/cmp-bool.ll @@ -25,8 +25,9 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind { ; CHECK-LABEL: bool_ne: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb %sil, %dil -; CHECK-NEXT: je .LBB1_1 +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: jmpq *%rdx # TAILCALL ; CHECK-NEXT: .LBB1_1: # %if.end diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -36,7 +36,7 @@ ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx ; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 @@ -54,7 +54,7 @@ ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx ; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 diff --git a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll --- a/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll +++ b/llvm/test/CodeGen/X86/cmp-shiftX-maskX.ll @@ -407,8 +407,9 @@ ; ; CHECK-AVX512-LABEL: shl_to_ror_eq_4xi32_s8: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vprold $8, %xmm0, %xmm1 -; CHECK-AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpslld $8, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpcmpeqd %xmm0, %xmm1, %xmm0 ; CHECK-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: retq %shr = shl <4 x i32> %x, diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll --- a/llvm/test/CodeGen/X86/cmp.ll +++ b/llvm/test/CodeGen/X86/cmp.ll @@ -310,8 +310,10 @@ define i8 @signbit_i16(i16 signext %L) { ; CHECK-LABEL: signbit_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff] -; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0] +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: shrl $15, %eax # encoding: [0xc1,0xe8,0x0f] +; CHECK-NEXT: xorb $1, %al # encoding: [0x34,0x01] +; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] %lshr = lshr i16 %L, 15 %trunc = trunc i16 %lshr to i8 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -589,8 +589,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v8i64_arg: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 
= xor i64 %a0, -1 %2 = insertelement <8 x i64> undef, i64 %1, i64 0 @@ -618,35 +619,38 @@ ; AVX1-LABEL: neg_scalar_broadcast_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: notq %rdi +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,0] +; AVX2-NEXT: notq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] ; AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <8 x i64> undef, i64 %1, i64 0 @@ -668,23 +672,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX1: # %bb.0: +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX2: # %bb.0: +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <4 x i64> undef, i64 %1, i64 0 @@ -708,29 +715,32 @@ ; AVX1-LABEL: neg_scalar_broadcast_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] -; AVX1-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def 
$xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <4 x i64> undef, i64 %1, i64 0 @@ -743,30 +753,33 @@ define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v2i64: ; SSE: # %bb.0: +; SSE-NEXT: notq %rdi ; SSE-NEXT: movq %rdi, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v2i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v2i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v2i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <2 x i64> undef, i64 %1, i64 0 @@ -819,23 +832,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v8i32: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i32: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i32: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastd %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i32 %a0, -1 %2 = insertelement <8 x i32> undef, i32 %1, i64 0 @@ -847,32 +863,35 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v8i16: ; SSE: # %bb.0: +; SSE-NEXT: notl %edi ; SSE-NEXT: movd %edi, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v8i16: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
neg_scalar_broadcast_v8i16: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i16: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastw %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i16 %a0, -1 %2 = insertelement <8 x i16> undef, i16 %1, i64 0 @@ -884,32 +903,36 @@ define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: notb %dil +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v16i8: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <16 x i8> undef, i8 %1, i64 0 @@ -954,8 +977,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v64i8: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <64 x i8> undef, i8 %1, i64 0 @@ -1000,8 +1024,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v64i8_v8i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <64 x i8> undef, i8 %1, i64 0 @@ -1025,24 +1050,27 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <32 x i8> undef, i8 %1, i64 0 @@ -1055,32 +1083,36 @@ define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) { ; 
SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: notb %dil +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <16 x i8> undef, i8 %1, i64 0 @@ -1102,23 +1134,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastd %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i32 %a0, -1 %2 = insertelement <8 x i32> undef, i32 %1, i64 0 diff --git a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll @@ -113,6 +113,7 @@ define <4 x i64> @demandedelts_vpsrlvq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: demandedelts_vpsrlvq: ; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq %xmm1, %xmm1 ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -75,9 +75,9 @@ ; X86-NEXT: andl $858993408, %eax # imm = 0x33333300 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $1431655744, %ecx # imm = 0x55555540 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -115,9 +115,9 @@ ; X64-NEXT: andl $858993408, %eax # imm = 0x33333300 ; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: andl $1431655744, %ecx # imm = 0x55555540 ; X64-NEXT: shrl 
%eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a0) @@ -163,7 +163,7 @@ ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $1431655764, %eax # imm = 0x55555554 ; X86-NEXT: leal (%eax,%ecx,2), %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl @@ -183,34 +183,33 @@ ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi ; X64-NEXT: andq %rax, %rdi -; X64-NEXT: leaq (%rdi,%rcx,4), %rax -; X64-NEXT: movabsq $6148914689804861440, %rcx # imm = 0x5555555500000000 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914685509894144, %rdx # imm = 0x5555555400000000 -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: leaq (%rdx,%rcx,2), %rax -; X64-NEXT: shrq $33, %rax -; X64-NEXT: bswapq %rax -; X64-NEXT: movabsq $1085102592318504960, %rcx # imm = 0xF0F0F0F00000000 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $4, %rax -; X64-NEXT: movabsq $1085102557958766592, %rdx # imm = 0xF0F0F0700000000 -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: shlq $4, %rcx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: movabsq $3689348813882916864, %rax # imm = 0x3333333300000000 -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rcx -; X64-NEXT: movabsq $3689348805292982272, %rdx # imm = 0x3333333100000000 +; X64-NEXT: leaq (%rdi,%rcx,4), %rdx +; X64-NEXT: movabsq $6148914689804861440, %rax # imm = 0x5555555500000000 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: shrq %rdx +; X64-NEXT: movabsq $6148914685509894144, %rcx # imm = 0x5555555400000000 ; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: leaq (%rdx,%rax,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: movq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rsi,2), %rdx +; X64-NEXT: shrq $33, %rdx +; X64-NEXT: bswapq %rdx +; X64-NEXT: movabsq $1085102592318504960, %rsi # imm = 0xF0F0F0F00000000 +; X64-NEXT: andq %rdx, %rsi +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: movabsq $1085102557958766592, %rdi # imm = 0xF0F0F0700000000 +; X64-NEXT: andq %rdx, %rdi +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: movabsq $3689348813882916864, %rdx # imm = 0x3333333300000000 +; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: movabsq $3689348805292982272, %rdi # imm = 0x3333333100000000 +; X64-NEXT: andq %rsi, %rdi +; X64-NEXT: leaq (%rdi,%rdx,4), %rdx +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: shrq %rdx ; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: shrq %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: leaq (%rdx,%rax,2), %rax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = lshr i64 %1, 33 @@ -254,9 +253,9 @@ ; X86-NEXT: andl $36909875, %eax # imm = 0x2333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $5592405, %ecx # imm = 0x555555 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $22369621, %eax # imm = 0x1555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -294,9 +293,9 @@ ; X64-NEXT: andl $36909875, %ecx # imm = 0x2333333 ; X64-NEXT: leal (%rcx,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: andl $5592405, %ecx # imm = 0x555555 ; X64-NEXT: shrl 
%eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: andl $22369621, %eax # imm = 0x1555555 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a0) @@ -338,7 +337,7 @@ ; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax @@ -358,13 +357,14 @@ ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X64-NEXT: shrl $2, %edi ; X64-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X64-NEXT: leal (%rdi,%rax,4), %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 +; X64-NEXT: leal (%rdi,%rax,4), %ecx +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: leal (%rax,%rcx,2), %eax +; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 +; X64-NEXT: shlq $34, %rcx ; X64-NEXT: shlq $33, %rax +; X64-NEXT: orq %rcx, %rax ; X64-NEXT: bswapq %rax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $235867919, %ecx # imm = 0xE0F0F0F diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -541,8 +541,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -551,14 +554,20 @@ ; AVX512F-NEXT: vmovq %rdi, %xmm2 ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2 -; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer @@ -590,25 +599,43 @@ ; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpcmpeqd %xmm3, 
%xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %a2 = load i64, ptr %p2 %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 @@ -914,19 +941,35 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm4 ; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm4, %xmm5 +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: bitselect_v8i64_broadcast_rrr: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %rdi, %zmm2 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: bitselect_v8i64_broadcast_rrr: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq %rdi, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: bitselect_v8i64_broadcast_rrr: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %rdi, %zmm2 +; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: retq %1 = insertelement <8 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer %3 = xor <8 x i64> %1, @@ -966,21 +1009,49 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm4 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, 
%ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq ; -; AVX512-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0 -; AVX512-NEXT: retq +; AVX2-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm4, %xmm5 +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %zmm2 +; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: retq %a2 = load i64, ptr %p2 %1 = insertelement <8 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll --- a/llvm/test/CodeGen/X86/combine-bswap.ll +++ b/llvm/test/CodeGen/X86/combine-bswap.ll @@ -42,15 +42,15 @@ define i16 @test_bswap_srli_8_bswap_i16(i16 %a) nounwind { ; X86-LABEL: test_bswap_srli_8_bswap_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $8, %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_bswap_srli_8_bswap_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $8, %eax +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: rolw $8, %ax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = call i16 @llvm.bswap.i16(i16 %a) @@ -106,7 +106,8 @@ ; X64-LABEL: test_bswap_shli_8_bswap_i16: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movzbl %ah, %eax +; X64-NEXT: andl $65280, %eax # imm = 0xFF00 +; X64-NEXT: rolw $8, %ax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = call i16 @llvm.bswap.i16(i16 %a) @@ -136,8 +137,12 @@ define i64 @test_bswap_shli_16_bswap_i64(i64 %a) nounwind { ; X86-LABEL: test_bswap_shli_16_bswap_i64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shrl $16, %edx ; X86-NEXT: retl ; ; X64-LABEL: test_bswap_shli_16_bswap_i64: @@ -220,7 +225,7 @@ define i64 @test_bswap64_shift48(i64 %a0) { ; X86-LABEL: test_bswap64_shift48: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax ; 
X86-NEXT: movzwl %ax, %eax ; X86-NEXT: xorl %edx, %edx diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -48,8 +48,7 @@ ; AVX1-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000 ; AVX1-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001 ; AVX1-NEXT: movq %rcx, 46348(%rax) -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3] -; AVX1-NEXT: # ymm0 = mem[0,1,0,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [?,?,?,?] ; AVX1-NEXT: vmovups %ymm0, 48296(%rax) ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd %xmm0, 47372(%rax) @@ -91,25 +90,24 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0,2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,0],xmm0[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: concat_of_broadcast_v4f32_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [6,7,4,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = <6,0,u,3> +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %ld0 = load volatile <8 x float>, ptr %a0 diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll --- a/llvm/test/CodeGen/X86/combine-multiplies.ll +++ b/llvm/test/CodeGen/X86/combine-multiplies.ll @@ -105,21 +105,21 @@ define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind { ; CHECK-LABEL: testCombineMultiplies_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11] -; CHECK-NEXT: paddd %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [22,22,22,22] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242] -; 
CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: movdqa %xmm2, v2 -; CHECK-NEXT: movdqa %xmm0, v3 -; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [242,242,242,242] +; CHECK-NEXT: paddd %xmm2, %xmm1 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; CHECK-NEXT: movdqa %xmm1, v2 +; CHECK-NEXT: movdqa %xmm2, v3 +; CHECK-NEXT: movdqa %xmm0, x ; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, @@ -139,20 +139,20 @@ define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind { ; CHECK-LABEL: testCombineMultiplies_non_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44] -; CHECK-NEXT: paddd %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [22,33,44,55] +; CHECK-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] -; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] +; CHECK-NEXT: paddd %xmm1, %xmm2 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; CHECK-NEXT: movdqa %xmm2, v2 -; CHECK-NEXT: movdqa %xmm0, v3 -; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: movdqa %xmm1, v3 +; CHECK-NEXT: movdqa %xmm0, x ; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -356,7 +356,9 @@ define <4 x float> @test25(<4 x float> %a0) { ; CHECK-LABEL: test25: ; CHECK: # %bb.0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <4 x float> %a0 to <4 x i32> %bc2 = bitcast <4 x float> to <4 x i32> diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -254,18 +254,18 @@ ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 
= xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %edi ; AVX2-NEXT: vpextrd $1, %xmm0, %esi @@ -277,18 +277,18 @@ ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %edi ; AVX512VL-NEXT: vpextrd $1, %xmm0, %esi @@ -300,18 +300,18 @@ ; AVX512DQVL: # %bb.0: # %entry ; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512DQVL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512DQVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX512DQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512DQVL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %edi ; AVX512DQVL-NEXT: vpextrd $1, %xmm0, %esi @@ -597,7 +597,7 @@ ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512VL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 ; AVX512VL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 @@ -616,7 +616,7 @@ ; AVX512DQVL-NEXT: .p2align 4, 0x90 ; AVX512DQVL-NEXT: .LBB8_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512DQVL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 ; AVX512DQVL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -164,8 +164,10 @@ ; ; AVX512-LABEL: combine_vec_rot_select_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512-NEXT: retq %3 = and <4 x i32> %1, %4 = shl <4 x i32> %0, %3 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1031,19 +1031,19 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $29, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: psrld $30, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1115,37 +1115,37 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrld $28, %xmm3 -; SSE2-NEXT: movdqa 
%xmm0, %xmm4 -; SSE2-NEXT: psrld $29, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE2-NEXT: psrld $30, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrld $28, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrld $29, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: psrld $30, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE2-NEXT: psrld $30, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1258,73 +1258,73 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,3] +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; 
SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm4 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrld $28, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm2[0,3] +; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 ; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movaps %xmm4, %xmm2 ; SSE2-NEXT: movaps %xmm5, %xmm3 @@ -1988,25 +1988,25 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $29, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa 
%xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: psrld $30, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE2-NEXT: psrad $2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -3055,7 +3055,8 @@ ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15] +; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2 +; XOP-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -3159,7 +3160,7 @@ ; CHECK-NEXT: testw %di, %di ; CHECK-NEXT: cmovnsl %edi, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: sarl $8, %eax +; CHECK-NEXT: shrl $8, %eax ; CHECK-NEXT: negl %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -711,13 +711,10 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; SSE2-LABEL: combine_vec_shl_mul0: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [20,20,20,20] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslld $2, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pslld $2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_mul0: diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -225,7 +225,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; 
AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -297,7 +298,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -494,7 +494,7 @@ ; CHECK-NEXT: leal 15(%rax), %ecx ; CHECK-NEXT: testw %ax, %ax ; CHECK-NEXT: cmovnsl %edi, %ecx -; CHECK-NEXT: andl $-16, %ecx +; CHECK-NEXT: andl $65520, %ecx # imm = 0xFFF0 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $rax ; CHECK-NEXT: retq @@ -509,7 +509,7 @@ ; CHECK-NEXT: leal 255(%rax), %ecx ; CHECK-NEXT: testw %ax, %ax ; CHECK-NEXT: cmovnsl %edi, %ecx -; CHECK-NEXT: andl $-256, %ecx +; CHECK-NEXT: andl $65280, %ecx # imm = 0xFF00 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -174,10 +174,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr0: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpsrlq $48, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7],zero,zero,ymm0[14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,22,23],zero,zero,ymm0[30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll @@ -139,18 +139,18 @@ ; SSE-LABEL: demandedelts_pblendvb: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pshufb %xmm0, %xmm3 +; SSE-NEXT: pshufb %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: demandedelts_pblendvb: ; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -286,10 +286,10 @@ ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: movdqu 16(%rdi), %xmm1 ; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 ; 
SSE-NEXT: psubd %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rdi) +; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: movdqu %xmm1, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: PR52032_oneuse_constant: @@ -317,14 +317,14 @@ ; SSE-NEXT: movdqu 16(%rdi), %xmm2 ; SSE-NEXT: movdqu 32(%rdi), %xmm3 ; SSE-NEXT: movdqu 48(%rdi), %xmm4 -; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm1 -; SSE-NEXT: movdqu %xmm1, (%rdi) +; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: movdqu %xmm2, 16(%rdi) -; SSE-NEXT: psubd %xmm0, %xmm4 +; SSE-NEXT: movdqu %xmm1, (%rdi) ; SSE-NEXT: psubd %xmm0, %xmm3 -; SSE-NEXT: movdqu %xmm3, 32(%rdi) +; SSE-NEXT: psubd %xmm0, %xmm4 ; SSE-NEXT: movdqu %xmm4, 48(%rdi) +; SSE-NEXT: movdqu %xmm3, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: PR52032: diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll --- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll +++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll @@ -54,11 +54,11 @@ define void @baz(ptr %arg, ptr %arg1) optsize { ; CHECK-LABEL: baz: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3] -; CHECK-NEXT: andps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; CHECK-NEXT: movups %xmm1, (%rsi) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq 8(%rdi), %rcx +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: andl $3, %ecx +; CHECK-NEXT: movq %rcx, 8(%rsi) ; CHECK-NEXT: retq bb: %tmp = load <2 x i64>, ptr %arg, align 16 diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -476,8 +476,8 @@ ; WIN64-NEXT: # %bb.5: # %sw.bb ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09] -; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: movl $1, %r8d # encoding: [0x41,0xb8,0x01,0x00,0x00,0x00] +; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.6: # %sw.bb diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1595,10 +1595,10 @@ define i64 @test_i64_140737488289792_mask_lshr_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_lshr_15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1615,10 +1615,11 @@ define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_lshr_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32767, %eax # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $16, %ecx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1769,10 +1770,10 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) { ; 
X86-LABEL: test_i64_140737488289792_mask_ashr_15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1789,10 +1790,11 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_ashr_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32767, %eax # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $16, %ecx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1996,12 +1998,13 @@ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_shl_15: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $16, %ecx ; X86-NEXT: movl $32767, %edx # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $15, %eax, %edx -; X86-NEXT: andl $65536, %eax # imm = 0x10000 -; X86-NEXT: shll $15, %eax +; X86-NEXT: shldl $15, %ecx, %edx +; X86-NEXT: shll $31, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_i64_140737488289792_mask_shl_15: @@ -2017,7 +2020,8 @@ define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_shl_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax ; X86-NEXT: movl $32767, %edx # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl $16, %eax, %edx diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -17,7 +17,8 @@ ; X64-NEXT: imull %ecx, %esi ; X64-NEXT: addl %edx, %esi ; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq entry: %tmp7 = mul i32 %idxY, %ref_frame_stride ; [#uses=2] @@ -51,13 +52,12 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %edx, %ebp @@ -66,17 +66,18 @@ ; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: setb %al ; X86-NEXT: movzbl %al, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload diff --git 
a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -7,7 +7,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -20,7 +22,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -33,7 +37,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setl %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -61,9 +67,11 @@ define i32 @select_or1(i32 %x, i32 %y) { ; CHECK-LABEL: select_or1: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -74,9 +82,11 @@ define i32 @select_or2(i32 %x, i32 %y) { ; CHECK-LABEL: select_or2: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -87,9 +97,11 @@ define i32 @select_or3(i32 %x, i32 %y) { ; CHECK-LABEL: select_or3: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setl %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -180,10 +192,9 @@ define i32 @sel_constants_shl_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_shl_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orl $2, %eax +; CHECK-NEXT: xorl $3, %eax ; CHECK-NEXT: shll $8, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 @@ -194,10 +205,12 @@ define i32 @shl_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: shl_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = shl i32 1, %sel @@ -207,10 +220,12 @@ define i32 @shl_constant_sel_setcc(i32 %a) { ; CHECK-LABEL: shl_constant_sel_setcc: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; 
CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %m = and i32 %a, 1 %cond = icmp ne i32 %m, 0 @@ -222,9 +237,12 @@ define i32 @lshr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: lshr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = lshr i32 64, %sel @@ -234,9 +252,12 @@ define i32 @lshr_constant_sel_setcc(i32 %a) { ; CHECK-LABEL: lshr_constant_sel_setcc: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %m = and i32 %a, 1 %cond = icmp ne i32 %m, 0 @@ -248,10 +269,12 @@ define i32 @ashr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: ashr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = ashr i32 128, %sel @@ -261,10 +284,12 @@ define i32 @ashr_constant_sel_setcc(i32 %a) { ; CHECK-LABEL: ashr_constant_sel_setcc: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %m = and i32 %a, 1 %cond = icmp ne i32 %m, 0 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -581,8 +581,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: addl $152, %esp @@ -1025,35 +1025,35 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm2, %eax -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm3, %eax +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-NEXT: movd %xmm0, 
%eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm4, %eax -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm4, %esi +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X86-NEXT: movd %xmm5, %eax +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X86-NEXT: movd %xmm5, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-NEXT: movdqa %xmm3, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: pmuludq %xmm1, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: psubd %xmm3, %xmm0 @@ -1081,25 +1081,25 @@ ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm4, %eax -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm5, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx -; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, (%rdi) -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X64-NEXT: movdqa %xmm4, (%rdi) +; X64-NEXT: pmuludq %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm3, %xmm1 +; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: psubd %xmm3, %xmm0 ; X64-NEXT: retq %div = sdiv <4 x i32> %x, %y store <4 x i32> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -543,8 +543,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: 
movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: addl $132, %esp @@ -987,35 +987,35 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm2, %eax -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm3, %eax +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm4, %eax -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm4, %esi +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X86-NEXT: movd %xmm5, %eax +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X86-NEXT: movd %xmm5, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-NEXT: movdqa %xmm3, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: pmuludq %xmm1, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: psubd %xmm3, %xmm0 @@ -1043,25 +1043,25 @@ ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm4, %eax -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm5, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx -; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, (%rdi) -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X64-NEXT: movdqa %xmm4, (%rdi) +; X64-NEXT: pmuludq %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm3, %xmm1 +; 
X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: psubd %xmm3, %xmm0 ; X64-NEXT: retq %div = udiv <4 x i32> %x, %y store <4 x i32> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -320,7 +320,10 @@ ; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 ; X64-FAST-NEXT: movq %rdi, %rax ; X64-FAST-NEXT: mulq %rcx -; X64-FAST-NEXT: movq %rdx, %rax +; X64-FAST-NEXT: subq %rdx, %rdi +; X64-FAST-NEXT: shrq %rdi +; X64-FAST-NEXT: leaq (%rdi,%rdx), %rax +; X64-FAST-NEXT: shrq $2, %rax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: PR23590: @@ -329,10 +332,14 @@ ; X64-SLOW-NEXT: movq %rdi, %rax ; X64-SLOW-NEXT: mulq %rcx ; X64-SLOW-NEXT: shrq $12, %rdx -; X64-SLOW-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039 -; X64-SLOW-NEXT: subq %rax, %rdi +; X64-SLOW-NEXT: imull $12345, %edx, %eax # imm = 0x3039 +; X64-SLOW-NEXT: subl %eax, %edi ; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925 ; X64-SLOW-NEXT: shrq $32, %rax +; X64-SLOW-NEXT: subl %eax, %edi +; X64-SLOW-NEXT: shrl %edi +; X64-SLOW-NEXT: addl %edi, %eax +; X64-SLOW-NEXT: shrl $2, %eax ; X64-SLOW-NEXT: retq entry: %rem = urem i64 %x, 12345 diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -19,8 +19,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __modti3 @@ -49,8 +49,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __divti3 @@ -79,8 +79,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __umodti3 @@ -969,8 +969,8 @@ ; WIN64-NEXT: movq %rdx, 8(%rax) ; WIN64-NEXT: movq %rcx, (%rax) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: andq $0, 8(%rdx) +; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: callq __umodti3 ; WIN64-NEXT: movq %xmm0, %rax @@ -1001,8 +1001,8 @@ ; WIN64-NEXT: movq %rdx, 8(%rax) ; WIN64-NEXT: movq %rcx, (%rax) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq $0, 8(%rdx) +; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: callq __umodti3 ; WIN64-NEXT: movq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll 
b/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll --- a/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll +++ b/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll @@ -10,10 +10,13 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: .cfi_def_cfa_register %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: subl $24, %esp ; CHECK-NEXT: movl $1074339512, {{[0-9]+}}(%esp) # imm = 0x40091EB8 -; CHECK-NEXT: movl $1374389535, (%esp) # imm = 0x51EB851F -; CHECK-NEXT: movl $1078523331, {{[0-9]+}}(%esp) # imm = 0x4048F5C3 +; CHECK-NEXT: movl $1374389535, {{[0-9]+}}(%esp) # imm = 0x51EB851F +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -6,13 +6,18 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: no_dpbusd: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -22,13 +27,16 @@ ; ; AVX512-LABEL: no_dpbusd: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -49,44 +57,41 @@ define i32 @vpdpbusd_mutate(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_mutate: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqa (%rsi), %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rdi), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovsxbd 8(%rdi), %ymm0 +; AVXVNNI-NEXT: vpmovsxbd (%rdi), %ymm1 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_mutate: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VNNI-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, 
%eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_mutate: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rdi), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_mutate: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd (%rdi), %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = sext <16 x i8> %0 to <16 x i32> @@ -109,9 +114,9 @@ ; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -128,9 +133,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -163,9 +168,9 @@ ; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -182,9 +187,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -210,44 +215,41 @@ define i32 @vpdpbusd_512(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_512: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_512: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VNNI-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm1, %zmm0, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_512: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rsi), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_512: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm1 +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -264,40 +266,35 @@ define i32 @vpdpbusd_256(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_256: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_256: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_256: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_256: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %ymm1 +; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <8 x i8>, ptr %a, align 8 %1 = zext <8 x i8> %0 to <8 x i32> @@ -314,42 +311,29 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_128: ; AVXVNNI: # %bb.0: # 
%entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %xmm1 +; AVXVNNI-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_128: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_128: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_128: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %xmm1 +; AVX512-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: retq entry: %0 = load <4 x i8>, ptr %a, align 8 %1 = zext <4 x i8> %0 to <4 x i32> @@ -367,40 +351,28 @@ ; AVXVNNI-LABEL: vpdpbusd_2xi32: ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax +; AVXVNNI-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVXVNNI-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax ; 
AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_2xi32: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_2xi32: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_2xi32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX512-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: retq entry: %0 = load <2 x i8>, ptr %a, align 8 %1 = zext <2 x i8> %0 to <2 x i32> @@ -417,13 +389,25 @@ define i32 @vpdpbusd_32xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_32xi32: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %ymm0, %ymm1 -; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 16(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm4, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd 24(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm4, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -431,38 +415,27 @@ ; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_32xi32: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VNNI-NEXT: vmovdqu (%rsi), %ymm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm1, %zmm0, %zmm2 -; AVX512VNNI-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_32xi32: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rsi), %ymm0, %ymm1 -; AVX512VLVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: vzeroupper -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_32xi32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpmovsxbd 16(%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <32 x i8>, ptr %a, align 16 %1 = zext <32 x i8> %0 to <32 x i32> @@ -479,17 +452,41 @@ define i32 @vpdpbusd_64xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_64xi32: ; AVXVNNI: # %bb.0: # %entry -; 
AVXVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVXVNNI-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %ymm0, %ymm2 -; AVXVNNI-NEXT: vpaddd %ymm3, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 40(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm8, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd 56(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm8, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 32(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm8, %ymm2 +; AVXVNNI-NEXT: vpmovsxbd 48(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpmovsxbd 16(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm8, %ymm4 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm4, %ymm3 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm5, %ymm4, %ymm4 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm4, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpmovsxbd 24(%rsi), %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm6, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm7, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -499,15 +496,27 @@ ; ; AVX512-LABEL: vpdpbusd_64xi32: ; 
AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpdpbusd (%rsi), %zmm0, %zmm1 -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd 16(%rsi), %zmm4 +; AVX512-NEXT: vpmulld %zmm0, %zmm4, %zmm0 +; AVX512-NEXT: vpmovsxbd 48(%rsi), %zmm4 +; AVX512-NEXT: vpmulld %zmm1, %zmm4, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm1 +; AVX512-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmovsxbd 32(%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -526,3 +535,6 @@ } declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX512VLVNNI: {{.*}} +; AVX512VNNI: {{.*}} diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -24,35 +24,17 @@ } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_zc: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -64,35 +46,39 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVXVNNI-LABEL: mul_4xi4_cz: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax +; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVXVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax ; AVXVNNI-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: 
vmovd %xmm1, %eax +; AVX512VNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVX512VNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vmovd %xmm0, %eax ; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper ; AVX512VNNI-NEXT: retq ; ; AVX512VLVNNI-LABEL: mul_4xi4_cz: ; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax +; AVX512VLVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax ; AVX512VLVNNI-NEXT: addl %edi, %eax ; AVX512VLVNNI-NEXT: retq entry: @@ -104,38 +90,17 @@ } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_cs: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_cs: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_cs: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -167,41 +132,41 @@ define i32 @mul_16xi8_zc(<16 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_16xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: 
vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm2 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: mul_16xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_16xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: mul_16xi8_zc: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edi, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = zext <16 x i8> %a to <16 x i32> %1 = mul nsw <16 x i32> %0, @@ -213,12 +178,26 @@ define i32 @mul_32xi8_zc(<32 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_32xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm4 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -232,9 +211,9 @@ ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512VNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VNNI-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VNNI-NEXT: vmovd %xmm0, %eax @@ -247,9 +226,9 @@ ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512VLVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax @@ -267,16 +246,41 @@ define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_64xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; 
AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm8 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm8, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm5, %ymm8, %ymm5 +; AVXVNNI-NEXT: vpmaddwd %ymm7, %ymm8, %ymm7 +; AVXVNNI-NEXT: vpmaddwd %ymm6, %ymm8, %ymm6 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm6, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm8, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm7, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm5, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm8, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -291,9 +295,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; 
AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -6,15 +6,20 @@ define i32 @mul_i8i8(ptr%a, <16 x i8> %b, i32 %c) { ; CHECK-LABEL: mul_i8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %esi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 @@ -30,14 +35,20 @@ ; CHECK-LABEL: mul_i4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> @@ -51,20 +62,23 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = 
[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 -; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpslld $28, %zmm1, %zmm1 +; CHECK-NEXT: vpsrad $28, %zmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> @@ -78,17 +92,20 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_sext_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; CHECK-NEXT: vpsllw $12, %ymm1, %ymm1 -; CHECK-NEXT: vpsraw $12, %ymm1, %ymm1 -; CHECK-NEXT: vpsllw $12, %ymm0, %ymm0 -; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0 -; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpslld $28, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $28, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $28, %zmm1, %zmm1 +; CHECK-NEXT: vpsrad $28, %zmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax @@ -108,16 +125,22 @@ ; CHECK-LABEL: mul_zext_i4i4: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll --- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll +++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll @@ -625,10 +625,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: retq @@ -1038,10 +1038,10 @@ ; 
ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: movq %rcx, 40(%rsi) @@ -1563,10 +1563,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: movq %rcx, 40(%rsi) @@ -1590,22 +1590,22 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { ; ALL-LABEL: vec512_i256: ; ALL: # %bb.0: -; ALL-NEXT: movq 16(%rdi), %rax -; ALL-NEXT: movq 24(%rdi), %rcx +; ALL-NEXT: movq 24(%rdi), %rax +; ALL-NEXT: movq 16(%rdi), %rcx ; ALL-NEXT: movq (%rdi), %rdx ; ALL-NEXT: movq 8(%rdi), %rdi -; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rdx +; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rcx, 24(%rsi) -; ALL-NEXT: movq %rdx, (%rsi) +; ALL-NEXT: movq %rax, 24(%rsi) +; ALL-NEXT: movq %rcx, 16(%rsi) ; ALL-NEXT: movq %rdi, 8(%rsi) -; ALL-NEXT: movq %rax, 48(%rsi) -; ALL-NEXT: movq %rcx, 56(%rsi) -; ALL-NEXT: movq %rdx, 32(%rsi) +; ALL-NEXT: movq %rdx, (%rsi) +; ALL-NEXT: movq %rax, 56(%rsi) +; ALL-NEXT: movq %rcx, 48(%rsi) ; ALL-NEXT: movq %rdi, 40(%rsi) +; ALL-NEXT: movq %rdx, 32(%rsi) ; ALL-NEXT: retq %in.elt.not = load i256, ptr %in.elt.ptr, align 64 %in.elt = xor i256 %in.elt.not, -1 diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll @@ -531,9 +531,11 @@ ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0] -; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $40, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 8 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -130,39 +130,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, 
%eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_sdiv_v4i32: @@ -309,39 +308,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_udiv_v4i32: @@ -488,39 +486,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %edx, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %edx, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_srem_v4i32: @@ -667,39 +664,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %edx, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %edx, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_urem_v4i32: diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -6174,13 +6174,13 @@ ; X64-NOBMI-LABEL: bextr64_32_c0: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movq %rsi, %rcx +; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NOBMI-NEXT: shrq %cl, %rdi +; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rax ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: shlq %cl, %rax ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq ; @@ -6196,7 +6196,8 @@ ; X64-BMI2-LABEL: bextr64_32_c0: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rax -; X64-BMI2-NEXT: bzhil %edx, %eax, %eax +; X64-BMI2-NEXT: bzhiq %rdx, %rax, %rax +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI2-NEXT: retq %shifted = lshr i64 %val, %numskipbits %numhighbits = sub i64 64, %numlowbits diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -9,22 +9,17 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; SSE2-NEXT: movl %eax, (%rdi) +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: foo: ; SSE42: # %bb.0: ; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movl $255, %eax ; SSE42-NEXT: pinsrb $3, %eax, %xmm0 ; SSE42-NEXT: movd %xmm0, (%rdi) @@ -33,7 +28,7 @@ ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -162,11 +157,32 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; SSE-NEXT: retq ; -; AVX-LABEL: cat_ext_straddle: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: cat_ext_straddle: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: 
vinsertf128 $1, (%rsi), %ymm0, %ymm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: cat_ext_straddle: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vbroadcastsd (%rsi), %ymm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cat_ext_straddle: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vbroadcastsd (%rsi), %ymm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq %x = load <6 x i32>, ptr %px %y = load <6 x i32>, ptr %py %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> diff --git a/llvm/test/CodeGen/X86/extract-fp.ll b/llvm/test/CodeGen/X86/extract-fp.ll --- a/llvm/test/CodeGen/X86/extract-fp.ll +++ b/llvm/test/CodeGen/X86/extract-fp.ll @@ -86,8 +86,8 @@ define float @ext_maxnum_v4f32(<4 x float> %x) nounwind { ; CHECK-LABEL: ext_maxnum_v4f32: ; CHECK: # %bb.0: +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> ) %r = extractelement <4 x float> %v, i32 2 diff --git a/llvm/test/CodeGen/X86/extract-insert.ll b/llvm/test/CodeGen/X86/extract-insert.ll --- a/llvm/test/CodeGen/X86/extract-insert.ll +++ b/llvm/test/CodeGen/X86/extract-insert.ll @@ -32,8 +32,8 @@ define i8 @extractelt_bitcast_extra_use(i32 %x, ptr %p) nounwind { ; X86-LABEL: extractelt_bitcast_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll --- a/llvm/test/CodeGen/X86/extract-lowbits.ll +++ b/llvm/test/CodeGen/X86/extract-lowbits.ll @@ -3083,23 +3083,25 @@ ; X64-NOBMI-LABEL: bzhi64_32_c0: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movq %rsi, %rcx +; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: negb %cl -; X64-NOBMI-NEXT: movq $-1, %rax +; X64-NOBMI-NEXT: shlq %cl, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq ; ; X64-BMI1-LABEL: bzhi64_32_c0: ; X64-BMI1: # %bb.0: ; X64-BMI1-NEXT: shll $8, %esi -; X64-BMI1-NEXT: bextrl %esi, %edi, %eax +; X64-BMI1-NEXT: bextrq %rsi, %rdi, %rax +; X64-BMI1-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: bzhi64_32_c0: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: bzhil %esi, %edi, %eax +; X64-BMI2-NEXT: bzhiq %rsi, %rdi, %rax +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI2-NEXT: retq %numhighbits = sub i64 64, %numlowbits %mask = lshr i64 -1, %numhighbits diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -571,16 +571,16 @@ define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; 
X64-LABEL: fmaxnum_v4f32: ; X64: # %bb.0: -; X64-NEXT: vmaxss %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmaxps %xmm0, %xmm1, %xmm2 +; X64-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fmaxnum_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm2 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -594,9 +594,9 @@ define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: fmaxnum_v4f64: ; X64: # %bb.0: -; X64-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X64-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -606,9 +606,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X86-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -623,16 +623,16 @@ define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: fminnum_v4f32: ; X64: # %bb.0: -; X64-NEXT: vminss %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X64-NEXT: vminps %xmm0, %xmm1, %xmm2 +; X64-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fminnum_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vminss %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vminps %xmm0, %xmm1, %xmm2 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -646,9 +646,9 @@ define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: fminnum_v4f64: ; X64: # %bb.0: -; X64-NEXT: vminsd %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X64-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -658,9 +658,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vminsd %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -406,10 +406,10 @@ ; X32-SSE2-NEXT: andl $-32, %esp ; X32-SSE2-NEXT: subl $64, %esp ; X32-SSE2-NEXT: movdqa zero, %xmm0 -; X32-SSE2-NEXT: movaps n1+16, %xmm1 -; X32-SSE2-NEXT: movaps n1, 
%xmm2 -; X32-SSE2-NEXT: movaps %xmm2, zero -; X32-SSE2-NEXT: movaps %xmm1, zero+16 +; X32-SSE2-NEXT: movaps n1, %xmm1 +; X32-SSE2-NEXT: movaps n1+16, %xmm2 +; X32-SSE2-NEXT: movaps %xmm2, zero+16 +; X32-SSE2-NEXT: movaps %xmm1, zero ; X32-SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] ; X32-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: movaps %xmm1, (%esp) @@ -444,8 +444,8 @@ ; X64-SSSE3-NEXT: movq n1@GOTPCREL(%rip), %rax ; X64-SSSE3-NEXT: movaps (%rax), %xmm1 ; X64-SSSE3-NEXT: movaps 16(%rax), %xmm2 -; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) ; X64-SSSE3-NEXT: movaps %xmm2, zero+16(%rip) +; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) ; X64-SSSE3-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] ; X64-SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; X64-SSSE3-NEXT: movaps %xmm1, (%rsp) diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll @@ -92,7 +92,9 @@ ; X86-LABEL: test_x86_vcvtph2ps_128_scalar: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] +; X86-NEXT: # xmm0 = mem[0],zero +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_scalar: @@ -103,7 +105,9 @@ ; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512VL-NEXT: # xmm0 = mem[0],zero +; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: @@ -122,7 +126,9 @@ ; X86-LABEL: test_x86_vcvtph2ps_128_scalar2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] +; X86-NEXT: # xmm0 = mem[0],zero +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_scalar2: @@ -133,7 +139,9 @@ ; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512VL-NEXT: # xmm0 = mem[0],zero +; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll --- a/llvm/test/CodeGen/X86/fdiv.ll +++ b/llvm/test/CodeGen/X86/fdiv.ll @@ -85,11 +85,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: mulss 
%xmm2, %xmm3 ; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: mulss %xmm2, %xmm1 -; CHECK-NEXT: subss %xmm2, %xmm3 -; CHECK-NEXT: divss %xmm3, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addss %xmm2, %xmm1 +; CHECK-NEXT: divss %xmm1, %xmm3 +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: retq %sub1 = fsub fast float %a0, %a1 %mul2 = fmul fast float %sub1, %a2 diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll --- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll @@ -68,14 +68,14 @@ define float @test_fneg_fma_subx_negy_negz_f32(float %w, float %x, float %y, float %z) { ; FMA3-LABEL: test_fneg_fma_subx_negy_negz_f32: ; FMA3: # %bb.0: # %entry -; FMA3-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; FMA3-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 +; FMA3-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; FMA3-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 ; FMA3-NEXT: retq ; ; FMA4-LABEL: test_fneg_fma_subx_negy_negz_f32: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm3 +; FMA4-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm3 ; FMA4-NEXT: retq entry: %subx = fsub nsz float %w, %x diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -443,20 +443,20 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -756,43 +756,43 @@ ; FMACALL32_BDVER2-NEXT: 
vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x74] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x8c,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x98,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; 
FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x1c,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $284, %esp ## encoding: [0x81,0xc4,0x1c,0x01,0x00,0x00] @@ -1336,84 +1336,84 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xd0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: 
[0xd9,0x5c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: 
[0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x40,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x4c,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x54] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x3c,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x2c,0x30] ; 
FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] @@ -1508,13 +1508,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -1723,23 +1723,23 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x38] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl 
{{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x44] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x18] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $236, %esp ## encoding: [0x81,0xc4,0xec,0x00,0x00,0x00] @@ -2048,44 +2048,44 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] +; 
FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x78] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x48] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1293,20 +1293,20 @@ ; ; FMA-NOINFS-LABEL: test_f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: 
vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz float 1.0, %t %tx = fmul nsz float %x, %t @@ -1342,20 +1342,20 @@ ; ; FMA-NOINFS-LABEL: test_v4f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v4f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v4f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <4 x float> , %t %tx = fmul nsz <4 x float> %x, %t @@ -1391,20 +1391,20 @@ ; ; FMA-NOINFS-LABEL: test_v8f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v8f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v8f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <8 x float> , %t %tx = fmul nsz <8 x float> %x, %t @@ -1440,20 +1440,20 @@ ; ; FMA-NOINFS-LABEL: test_f64_interp: 
; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddsd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz double 1.0, %t %tx = fmul nsz double %x, %t @@ -1492,20 +1492,20 @@ ; ; FMA-NOINFS-LABEL: test_v2f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v2f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v2f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <2 x double> , %t %tx = fmul nsz <2 x double> %x, %t @@ -1541,20 +1541,20 @@ ; ; FMA-NOINFS-LABEL: test_v4f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v4f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v4f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <4 x double> , %t %tx = fmul nsz <4 x double> %x, %t @@ -1612,17 
+1612,26 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; FMA-LABEL: test_v4f32_fneg_fnmadd: ; FMA: # %bb.0: -; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; FMA-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; FMA-NEXT: vfnmsub231ps {{.*#+}} xmm3 = -(xmm1 * xmm0) - xmm3 +; FMA-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; FMA-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: test_v4f32_fneg_fnmadd: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 +; FMA4-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm3 +; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: test_v4f32_fneg_fnmadd: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231ps {{.*#+}} xmm3 = -(xmm1 * xmm0) - xmm3 +; AVX512-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq %mul = fmul nsz <4 x float> %a0, %a1 %neg0 = fsub nsz <4 x float> , %mul @@ -1634,17 +1643,23 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; FMA-LABEL: test_v4f64_fneg_fnmsub: ; FMA: # %bb.0: -; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; FMA-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; FMA-NEXT: vfnmsub231pd {{.*#+}} ymm3 = -(ymm1 * ymm0) - ymm3 +; FMA-NEXT: vsubpd %ymm3, %ymm2, %ymm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: test_v4f64_fneg_fnmsub: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 +; FMA4-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm3 +; FMA4-NEXT: vsubpd %ymm0, %ymm2, %ymm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: test_v4f64_fneg_fnmsub: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231pd {{.*#+}} ymm3 = -(ymm1 * ymm0) - ymm3 +; AVX512-NEXT: vsubpd %ymm3, %ymm2, %ymm0 ; AVX512-NEXT: retq %mul = fmul nsz <4 x double> %a0, %a1 %neg0 = fsub nsz <4 x double> , %mul @@ -1888,28 +1903,26 @@ define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind { ; FMA-LABEL: fadd_fma_fmul_3: ; FMA: # %bb.0: -; FMA-NEXT: vmulpd %xmm3, %xmm2, %xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 -; FMA-NEXT: vmovapd %xmm2, %xmm0 +; FMA-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm5 * xmm4) + xmm6 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm3 * xmm2) + xmm6 +; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm6 ; FMA-NEXT: retq ; ; FMA4-LABEL: fadd_fma_fmul_3: ; FMA4: # %bb.0: -; FMA4-NEXT: vmulpd %xmm3, %xmm2, %xmm2 +; FMA4-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm4 = (xmm4 * xmm5) + xmm6 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4 ; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm6 * xmm7) + xmm0 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm4 * xmm5) + xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: fadd_fma_fmul_3: ; 
AVX512: # %bb.0: -; AVX512-NEXT: vmulpd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 -; AVX512-NEXT: vmovapd %xmm2, %xmm0 +; AVX512-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm5 * xmm4) + xmm6 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm3 * xmm2) + xmm6 +; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm6 ; AVX512-NEXT: retq %m1 = fmul fast <2 x double> %x1, %x2 %m2 = fmul fast <2 x double> %x3, %x4 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -849,24 +849,24 @@ ; ; FMA-NOINFS-LABEL: test_v16f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v16f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v16f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} zmm1 = -(zmm2 * zmm1) + zmm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <16 x float> , %t %tx = fmul nsz <16 x float> %x, %t @@ -908,24 +908,24 @@ ; ; FMA-NOINFS-LABEL: test_v8f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v8f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} 
ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v8f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} zmm1 = -(zmm2 * zmm1) + zmm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <8 x double> , %t %tx = fmul nsz <8 x double> %x, %t @@ -999,7 +999,10 @@ ; ; AVX512-LABEL: test_v16f32_fneg_fnmadd: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 +; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231ps {{.*#+}} zmm3 = -(zmm1 * zmm0) - zmm3 +; AVX512-NEXT: vaddps %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: retq %mul = fmul nsz <16 x float> %a0, %a1 %neg0 = fsub nsz <16 x float> , %mul @@ -1023,7 +1026,9 @@ ; ; AVX512-LABEL: test_v8f64_fneg_fnmsub: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 +; AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231pd {{.*#+}} zmm3 = -(zmm1 * zmm0) - zmm3 +; AVX512-NEXT: vsubpd %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: retq %mul = fmul nsz <8 x double> %a0, %a1 %neg0 = fsub nsz <8 x double> , %mul diff --git a/llvm/test/CodeGen/X86/fmul-combines.ll b/llvm/test/CodeGen/X86/fmul-combines.ll --- a/llvm/test/CodeGen/X86/fmul-combines.ll +++ b/llvm/test/CodeGen/X86/fmul-combines.ll @@ -114,10 +114,12 @@ ret <4 x float> %z } -; CHECK: float 5 -; CHECK: float 12 -; CHECK: float 21 -; CHECK: float 32 + +; CHECK: .LCPI12_0: +; CHECK-NEXT: .long 0x40a00000 +; CHECK-NEXT: .long 0x41400000 +; CHECK-NEXT: .long 0x41a80000 +; CHECK-NEXT: .long 0x42000000 ; We should be able to pre-multiply the two constant vectors. define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) { @@ -165,17 +167,26 @@ ret <4 x float> %z } -; CHECK: float 6 -; CHECK: float 14 -; CHECK: float 24 -; CHECK: float 36 +; CHECK: .LCPI16_0: +; CHECK-NEXT: .long 0x3f800000 +; CHECK-NEXT: .long 0x40000000 +; CHECK-NEXT: .long 0x40400000 +; CHECK-NEXT: .long 0x40800000 +; CHECK: .LCPI16_1: +; CHECK-NEXT: .long 0x40a00000 +; CHECK-NEXT: .long 0x41400000 +; CHECK-NEXT: .long 0x41a80000 +; CHECK-NEXT: .long 0x42000000 ; More than one use of a constant multiply should not inhibit the optimization. ; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) { ; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: ; CHECK: # %bb.0: +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; CHECK-NEXT: mulps %xmm0, %xmm1 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = fmul fast <4 x float> %x, %z = fmul fast <4 x float> %y, @@ -186,10 +197,11 @@ ; PR22698 - http://llvm.org/bugs/show_bug.cgi?id=22698 ; Make sure that we don't infinite loop swapping constants back and forth. 
-; CHECK: float 24 -; CHECK: float 24 -; CHECK: float 24 -; CHECK: float 24 +; CHECK: .LCPI17_0: +; CHECK-NEXT: .long 0x41c00000 +; CHECK-NEXT: .long 0x41c00000 +; CHECK-NEXT: .long 0x41c00000 +; CHECK-NEXT: .long 0x41c00000 define <4 x float> @PR22698_splats(<4 x float> %a) { ; CHECK-LABEL: PR22698_splats: @@ -204,10 +216,11 @@ ; Same as above, but verify that non-splat vectors are handled correctly too. -; CHECK: float 45 -; CHECK: float 120 -; CHECK: float 231 -; CHECK: float 384 +; CHECK: .LCPI18_0: +; CHECK-NEXT: .long 0x42340000 +; CHECK-NEXT: .long 0x42f00000 +; CHECK-NEXT: .long 0x43670000 +; CHECK-NEXT: .long 0x43c00000 define <4 x float> @PR22698_no_splats(<4 x float> %a) { ; CHECK-LABEL: PR22698_no_splats: @@ -269,7 +282,14 @@ ; CHECK-LABEL: getNegatedExpression_crash: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $0, (%rdi) -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: mulss %xmm0, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: mulss %xmm0, %xmm2 +; CHECK-NEXT: mulss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: retq store float 0.0, ptr %p, align 1 %real = load float, ptr %p, align 1 diff --git a/llvm/test/CodeGen/X86/fold-call-3.ll b/llvm/test/CodeGen/X86/fold-call-3.ll --- a/llvm/test/CodeGen/X86/fold-call-3.ll +++ b/llvm/test/CodeGen/X86/fold-call-3.ll @@ -60,8 +60,8 @@ ; pre-RA-NEXT: movq %rax, %rsi ; pre-RA-NEXT: callq *560(%rcx) ; pre-RA-NEXT: incl %ebp -; pre-RA-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; pre-RA-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; pre-RA-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; pre-RA-NEXT: cmpl _NumTrials(%rip), %ebp ; pre-RA-NEXT: jb LBB0_2 ; pre-RA-NEXT: ## %bb.3: diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -805,25 +805,49 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-SSE-LABEL: fmul_pow_select: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: leal 1(%rdi), %eax -; CHECK-SSE-NEXT: testb $1, %sil -; CHECK-SSE-NEXT: cmovnel %edi, %eax -; CHECK-SSE-NEXT: shll $23, %eax -; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: movl %edi, %ecx +; CHECK-SSE-NEXT: andl $1, %esi +; CHECK-SSE-NEXT: movl $2, %eax +; CHECK-SSE-NEXT: subl %esi, %eax +; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-SSE-NEXT: shll %cl, %eax +; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0 +; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_pow_select: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: leal 1(%rdi), %eax -; CHECK-AVX-NEXT: testb $1, %sil -; CHECK-AVX-NEXT: cmovnel %edi, %eax -; CHECK-AVX-NEXT: shll $23, %eax -; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: fmul_pow_select: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movl %edi, %ecx +; CHECK-AVX2-NEXT: andl $1, %esi +; CHECK-AVX2-NEXT: movl $2, %eax +; CHECK-AVX2-NEXT: subl %esi, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-AVX2-NEXT: shll %cl, %eax +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, 
%xmm0, %xmm0 +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-NO-FASTFMA-LABEL: fmul_pow_select: +; CHECK-NO-FASTFMA: # %bb.0: +; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx +; CHECK-NO-FASTFMA-NEXT: andl $1, %esi +; CHECK-NO-FASTFMA-NEXT: movl $2, %eax +; CHECK-NO-FASTFMA-NEXT: subl %esi, %eax +; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax +; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: retq +; +; CHECK-FMA-LABEL: fmul_pow_select: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: andl $1, %esi +; CHECK-FMA-NEXT: movl $2, %eax +; CHECK-FMA-NEXT: subl %esi, %eax +; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax +; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-FMA-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt %shl = select i1 %c, i32 %shl1, i32 %shl2 @@ -835,25 +859,53 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: addl $3, %edi -; CHECK-SSE-NEXT: cmpl $13, %edi -; CHECK-SSE-NEXT: movl $13, %eax -; CHECK-SSE-NEXT: cmovbl %edi, %eax -; CHECK-SSE-NEXT: shll $23, %eax -; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: movq %rdi, %rcx +; CHECK-SSE-NEXT: movl $8, %eax +; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-SSE-NEXT: shlq %cl, %rax +; CHECK-SSE-NEXT: cmpq $8192, %rax # imm = 0x2000 +; CHECK-SSE-NEXT: movl $8192, %ecx # imm = 0x2000 +; CHECK-SSE-NEXT: cmovbq %rax, %rcx +; CHECK-SSE-NEXT: cvtsi2ss %rcx, %xmm0 +; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: fmul_fly_pow_mul_min_pow2: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: addl $3, %edi -; CHECK-AVX-NEXT: cmpl $13, %edi -; CHECK-AVX-NEXT: movl $13, %eax -; CHECK-AVX-NEXT: cmovbl %edi, %eax -; CHECK-AVX-NEXT: shll $23, %eax -; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX2-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: movq %rdi, %rcx +; CHECK-AVX2-NEXT: movl $8, %eax +; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-AVX2-NEXT: shlq %cl, %rax +; CHECK-AVX2-NEXT: cmpq $8192, %rax # imm = 0x2000 +; CHECK-AVX2-NEXT: movl $8192, %ecx # imm = 0x2000 +; CHECK-AVX2-NEXT: cmovbq %rax, %rcx +; CHECK-AVX2-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-NO-FASTFMA-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-NO-FASTFMA: # %bb.0: +; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx +; CHECK-NO-FASTFMA-NEXT: movl $8, %eax +; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax +; CHECK-NO-FASTFMA-NEXT: cmpq $8192, %rax # imm = 0x2000 +; CHECK-NO-FASTFMA-NEXT: movl $8192, %ecx # imm = 0x2000 +; CHECK-NO-FASTFMA-NEXT: cmovbq %rax, %rcx +; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: retq +; +; CHECK-FMA-LABEL: fmul_fly_pow_mul_min_pow2: +; CHECK-FMA: # %bb.0: +; CHECK-FMA-NEXT: movl $8, %eax +; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax +; 
CHECK-FMA-NEXT: cmpq $8192, %rax # imm = 0x2000 +; CHECK-FMA-NEXT: movl $8192, %ecx # imm = 0x2000 +; CHECK-FMA-NEXT: cmovbq %rax, %rcx +; CHECK-FMA-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-FMA-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) %conv = uitofp i64 %shl to float @@ -1089,10 +1141,9 @@ ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: ; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0 -; CHECK-FMA-NEXT: vcvtudq2ps %xmm0, %xmm0 -; CHECK-FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 +; CHECK-FMA-NEXT: vpslld $23, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nsw nuw <4 x i32> , %cnt %conv = uitofp <4 x i32> %shl to <4 x float> diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -30,11 +30,10 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { ; NOBMI-LABEL: masked_merge1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: andl %edi, %esi -; NOBMI-NEXT: notl %eax -; NOBMI-NEXT: andl %edx, %eax -; NOBMI-NEXT: orl %esi, %eax +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: xorl %edx, %eax +; NOBMI-NEXT: andl %edi, %eax +; NOBMI-NEXT: xorl %edx, %eax ; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fold-rmw-ops.ll b/llvm/test/CodeGen/X86/fold-rmw-ops.ll --- a/llvm/test/CodeGen/X86/fold-rmw-ops.ll +++ b/llvm/test/CodeGen/X86/fold-rmw-ops.ll @@ -1041,9 +1041,12 @@ define void @and32_imm_br() nounwind { ; CHECK-LABEL: and32_imm_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andl $-2147483648, g32(%rip) # encoding: [0x81,0x25,A,A,A,A,0x00,0x00,0x00,0x80] -; CHECK-NEXT: # fixup A - offset: 2, value: g32-8, kind: reloc_riprel_4byte +; CHECK-NEXT: movl $-2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80] ; CHECK-NEXT: # imm = 0x80000000 +; CHECK-NEXT: andl g32(%rip), %eax # encoding: [0x23,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte_relax +; CHECK-NEXT: movl %eax, g32(%rip) # encoding: [0x89,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte ; CHECK-NEXT: jne b # TAILCALL ; CHECK-NEXT: # encoding: [0x75,A] ; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1 @@ -1127,9 +1130,12 @@ define void @and16_imm_br() nounwind { ; CHECK-LABEL: and16_imm_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andw $-32768, g16(%rip) # encoding: [0x66,0x81,0x25,A,A,A,A,0x00,0x80] -; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte +; CHECK-NEXT: movzwl g16(%rip), %eax # encoding: [0x0f,0xb7,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte +; CHECK-NEXT: andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00] ; CHECK-NEXT: # imm = 0x8000 +; CHECK-NEXT: movw %ax, g16(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte ; CHECK-NEXT: jne b # TAILCALL ; CHECK-NEXT: # encoding: [0x75,A] ; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll 
b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1133,10 +1133,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1470,10 +1470,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll --- a/llvm/test/CodeGen/X86/fp-logic.ll +++ b/llvm/test/CodeGen/X86/fp-logic.ll @@ -231,8 +231,9 @@ define float @movmsk(float %x) { ; CHECK-LABEL: movmsk: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: andps %xmm1, %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: shll $31, %eax +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast float %x to i32 %and = and i32 %bc1, 2147483648 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -496,10 +496,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -635,10 +635,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -675,10 +675,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -715,10 +715,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -753,10 +753,10 @@ ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -792,10 +792,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -833,10 +833,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -873,10 +873,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -913,10 +913,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -951,10 +951,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -990,10 +990,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1031,10 +1031,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll 
b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -1123,11 +1123,9 @@ ; X64-SSE-NEXT: movaps %xmm0, %xmm1 ; X64-SSE-NEXT: callq __multf3@PLT ; X64-SSE-NEXT: movaps %xmm0, (%rsp) -; X64-SSE-NEXT: movq (%rsp), %rcx -; X64-SSE-NEXT: movq %rcx, %rdx -; X64-SSE-NEXT: shrq $32, %rdx +; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; X64-SSE-NEXT: xorl %eax, %eax -; X64-SSE-NEXT: orl %ecx, %edx +; X64-SSE-NEXT: orl (%rsp), %ecx ; X64-SSE-NEXT: sete %al ; X64-SSE-NEXT: addq $24, %rsp ; X64-SSE-NEXT: retq @@ -1169,11 +1167,9 @@ ; X64-AVX-NEXT: vmovaps %xmm0, %xmm1 ; X64-AVX-NEXT: callq __multf3@PLT ; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) -; X64-AVX-NEXT: movq (%rsp), %rcx -; X64-AVX-NEXT: movq %rcx, %rdx -; X64-AVX-NEXT: shrq $32, %rdx +; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: orl %ecx, %edx +; X64-AVX-NEXT: orl (%rsp), %ecx ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: addq $24, %rsp ; X64-AVX-NEXT: retq @@ -1221,14 +1217,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: addl $3, %ecx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl $3, %esi ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, 8(%eax) -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %edi, 12(%eax) +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %ecx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32-NEXT: movl %edx, 12(%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll --- a/llvm/test/CodeGen/X86/fpclamptosat.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat.ll @@ -55,7 +55,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: @@ -121,7 +121,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: @@ -198,7 +198,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq @@ -260,7 +260,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq entry: @@ -319,7 +319,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq entry: @@ -391,7 +391,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq @@ -453,14 +453,11 @@ ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq 
%rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq entry: @@ -519,14 +516,11 @@ ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq entry: @@ -591,14 +585,11 @@ ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -15,31 +15,32 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -76,11 +77,13 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pand %xmm2, %xmm0 @@ -107,14 +110,14 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -163,27 +166,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 ; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 +; CHECK-NEXT: pshufd 
{{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 ; CHECK-NEXT: pandn %xmm3, %xmm5 @@ -191,30 +194,31 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm5, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm3, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm5 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: pand %xmm6, %xmm5 +; CHECK-NEXT: pandn %xmm2, %xmm6 +; CHECK-NEXT: por %xmm5, %xmm6 ; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -273,27 +277,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 +; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 ; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pxor %xmm4, %xmm3 +; CHECK-NEXT: pxor %xmm5, %xmm3 ; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 -; 
CHECK-NEXT: pand %xmm5, %xmm0 -; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm5 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq entry: @@ -326,27 +330,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 ; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 ; CHECK-NEXT: pandn %xmm3, %xmm5 @@ -421,28 +425,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor 
%xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -453,30 +457,31 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm3, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: pand %xmm6, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm6 +; CHECK-NEXT: por %xmm4, %xmm6 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm5, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -553,28 +558,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 ; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pxor %xmm3, %xmm2 +; CHECK-NEXT: pxor %xmm4, %xmm2 ; CHECK-NEXT: por 
%xmm0, %xmm2 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm6, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm4, %xmm0 -; CHECK-NEXT: pxor %xmm0, %xmm3 +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm4 ; CHECK-NEXT: pand %xmm6, %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -625,28 +630,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -1591,31 +1596,32 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; 
CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -1650,11 +1656,13 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pand %xmm2, %xmm0 @@ -1680,14 +1688,14 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1733,59 +1741,60 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; CHECK-NEXT: movdqa 
%xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm7, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: pandn %xmm5, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 +; CHECK-NEXT: movdqa %xmm4, %xmm6 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm5, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm5 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm5, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por 
%xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -1842,27 +1851,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 +; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 ; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pxor %xmm4, %xmm3 +; CHECK-NEXT: pxor %xmm5, %xmm3 ; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm5, %xmm0 -; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm5 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq entry: @@ -1893,33 +1902,33 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm7, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: pandn %xmm5, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 +; CHECK-NEXT: movdqa %xmm4, %xmm6 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; CHECK-NEXT: 
pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm5, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 @@ -1928,7 +1937,7 @@ ; CHECK-NEXT: pand %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm3 ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm4 @@ -1986,63 +1995,64 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm7 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: 
pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm4, %xmm2 +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2117,28 +2127,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 ; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pxor %xmm3, %xmm2 +; CHECK-NEXT: pxor %xmm4, %xmm2 ; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm6, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm4, %xmm0 -; CHECK-NEXT: pxor %xmm0, %xmm3 +; 
CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm0 +; CHECK-NEXT: pxor %xmm0, %xmm4 ; CHECK-NEXT: pand %xmm6, %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -2187,36 +2197,36 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm7 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 diff --git a/llvm/test/CodeGen/X86/fpenv-combine.ll b/llvm/test/CodeGen/X86/fpenv-combine.ll --- a/llvm/test/CodeGen/X86/fpenv-combine.ll +++ b/llvm/test/CodeGen/X86/fpenv-combine.ll @@ -21,14 +21,14 @@ ; X64-NEXT: movq (%rsp), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; X64-NEXT: movq %rsi, 16(%r14) -; X64-NEXT: movq %rcx, (%r14) -; X64-NEXT: movq %rax, 24(%r14) +; X64-NEXT: movq %rsi, 24(%r14) ; X64-NEXT: movq %rdx, 8(%r14) -; X64-NEXT: movq %rsi, 16(%rbx) -; X64-NEXT: movq %rcx, (%rbx) -; X64-NEXT: movq %rax, 24(%rbx) +; X64-NEXT: movq %rax, 16(%r14) +; 
X64-NEXT: movq %rcx, (%r14) +; X64-NEXT: movq %rsi, 24(%rbx) +; X64-NEXT: movq %rax, 16(%rbx) ; X64-NEXT: movq %rdx, 8(%rbx) +; X64-NEXT: movq %rcx, (%rbx) ; X64-NEXT: addq $40, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14 diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll --- a/llvm/test/CodeGen/X86/fpenv.ll +++ b/llvm/test/CodeGen/X86/fpenv.ll @@ -16,7 +16,9 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: orb $12, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movzwl (%esp), %eax +; X86-NOSSE-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NOSSE-NEXT: movw %ax, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl @@ -25,7 +27,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: orb $12, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl (%esp), %eax +; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: fldcw (%esp) ; X86-SSE-NEXT: stmxcsr (%esp) ; X86-SSE-NEXT: orb $96, {{[0-9]+}}(%esp) @@ -36,7 +40,9 @@ ; X64-LABEL: func_01: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: orb $12, -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: orb $96, -{{[0-9]+}}(%rsp) @@ -86,8 +92,8 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-NOSSE-NEXT: andl (%esp), %eax +; X86-NOSSE-NEXT: movzwl (%esp), %eax +; X86-NOSSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-NOSSE-NEXT: orl $2048, %eax # imm = 0x800 ; X86-NOSSE-NEXT: movw %ax, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -98,8 +104,8 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: movzwl (%esp), %eax +; X86-SSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-SSE-NEXT: orl $2048, %eax # imm = 0x800 ; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -115,8 +121,8 @@ ; X64-LABEL: func_03: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $62463, %eax # imm = 0xF3FF ; X64-NEXT: orl $2048, %eax # imm = 0x800 ; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) @@ -136,8 +142,8 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-NOSSE-NEXT: andl (%esp), %eax +; X86-NOSSE-NEXT: movzwl (%esp), %eax +; X86-NOSSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-NOSSE-NEXT: orl $1024, %eax # imm = 0x400 ; X86-NOSSE-NEXT: movw %ax, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -148,8 +154,8 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: movzwl (%esp), %eax +; X86-SSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-SSE-NEXT: orl $1024, %eax # imm = 0x400 ; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -165,8 +171,8 @@ ; X64-LABEL: func_04: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X64-NEXT: 
andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $62463, %eax # imm = 0xF3FF ; X64-NEXT: orl $1024, %eax # imm = 0x400 ; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) @@ -192,8 +198,8 @@ ; X86-NOSSE-NEXT: shll %cl, %eax ; X86-NOSSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF -; X86-NOSSE-NEXT: andl (%esp), %ecx +; X86-NOSSE-NEXT: movzwl (%esp), %ecx +; X86-NOSSE-NEXT: andl $62463, %ecx # imm = 0xF3FF ; X86-NOSSE-NEXT: orl %eax, %ecx ; X86-NOSSE-NEXT: movw %cx, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -210,8 +216,8 @@ ; X86-SSE-NEXT: shll %cl, %eax ; X86-SSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF -; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: movzwl (%esp), %ecx +; X86-SSE-NEXT: andl $62463, %ecx # imm = 0xF3FF ; X86-SSE-NEXT: orl %eax, %ecx ; X86-SSE-NEXT: movw %cx, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -233,8 +239,8 @@ ; X64-NEXT: shll %cl, %eax ; X64-NEXT: andl $3072, %eax # imm = 0xC00 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: movl $-3073, %ecx # imm = 0xF3FF -; X64-NEXT: andl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $62463, %ecx # imm = 0xF3FF ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -6,6 +6,7 @@ ; X86-LABEL: freeze_sext: ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cwtl ; X86-NEXT: retl ; ; X64-LABEL: freeze_sext: @@ -40,6 +41,7 @@ ; X86-LABEL: freeze_zext: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_zext: diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -349,15 +349,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl (%edx), %edx ; X86-NEXT: andl $15, %edx -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] -; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vmovd %edx, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7] +; X86-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-NEXT: vmovdqa %xmm1, (%ecx) ; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm3, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: retl ; @@ -365,15 +366,15 @@ ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: andl $15, %eax -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vmovdqa %xmm0, (%rdx) ; X64-NEXT: vmovd %eax, %xmm0 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; X64-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7] +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-NEXT: vmovdqa %xmm2, (%rdx) +; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] +; X64-NEXT: vpand %xmm3, %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: retq %i0.src = load i32, ptr %origin0 diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -180,7 +180,7 @@ ; X64-AVX2-NEXT: andq %rdx, %rax ; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX2-NEXT: mulq %rdx -; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax +; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx ; X64-AVX2-NEXT: shlq $27, %rsi @@ -347,7 +347,7 @@ ; X64-AVX2-NEXT: andq %rdx, %rax ; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX2-NEXT: mulq %rdx -; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax +; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx ; X64-AVX2-NEXT: addl $27, %ecx @@ -979,23 +979,21 @@ ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: leal (%eax,%eax,2), %esi -; X86-SSE2-NEXT: movzwl 8(%ecx,%esi,4), %edx -; X86-SSE2-NEXT: movl 4(%ecx,%esi,4), %edi -; X86-SSE2-NEXT: shrdl $8, %edx, %edi -; X86-SSE2-NEXT: xorl %eax, %edi +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: leal (%eax,%eax,2), %edi +; X86-SSE2-NEXT: movzwl 8(%esi,%edi,4), %ecx +; X86-SSE2-NEXT: movl 4(%esi,%edi,4), %edx +; X86-SSE2-NEXT: shrdl $8, %ecx, %edx +; X86-SSE2-NEXT: xorl %eax, %edx ; X86-SSE2-NEXT: sarl $31, %eax -; X86-SSE2-NEXT: movzbl 10(%ecx,%esi,4), %ecx -; X86-SSE2-NEXT: shll $16, %ecx -; X86-SSE2-NEXT: orl %edx, %ecx -; X86-SSE2-NEXT: shll $8, %ecx -; X86-SSE2-NEXT: movl %ecx, %edx -; X86-SSE2-NEXT: sarl $8, %edx -; X86-SSE2-NEXT: sarl $31, %ecx -; X86-SSE2-NEXT: shldl $24, %edx, %ecx -; X86-SSE2-NEXT: xorl %eax, %ecx +; X86-SSE2-NEXT: movsbl 10(%esi,%edi,4), %esi +; X86-SSE2-NEXT: movl %esi, %edi +; X86-SSE2-NEXT: shll $16, %edi ; X86-SSE2-NEXT: orl %ecx, %edi +; X86-SSE2-NEXT: sarl $31, %esi +; X86-SSE2-NEXT: shldl $24, %edi, %esi +; X86-SSE2-NEXT: xorl %eax, %esi +; X86-SSE2-NEXT: orl %esi, %edx ; X86-SSE2-NEXT: jne .LBB46_1 ; X86-SSE2-NEXT: # %bb.2: ; X86-SSE2-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/h-registers-2.ll b/llvm/test/CodeGen/X86/h-registers-2.ll --- a/llvm/test/CodeGen/X86/h-registers-2.ll +++ b/llvm/test/CodeGen/X86/h-registers-2.ll @@ -8,8 +8,7 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl %ah, %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movb $77, (%ecx,%eax,8) ; CHECK-NEXT: shll $3, %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -195,12 +195,12 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: addl %eax, %esi +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm1, %edi +; SSE3-NEXT: addl %eax, %edi ; 
SSE3-NEXT: movd %esi, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: movd %xmm1, %esi -; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: movd %edi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %ecx, %xmm2 ; SSE3-NEXT: movd %edx, %xmm0 @@ -311,12 +311,12 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: subl %esi, %edx -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: movd %xmm1, %edx -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE3-NEXT: movd %xmm1, %esi -; SSE3-NEXT: subl %esi, %edx -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: movd %xmm0, %edi +; SSE3-NEXT: subl %edi, %esi +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %eax, %xmm2 ; SSE3-NEXT: movd %ecx, %xmm0 diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -72,11 +72,11 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: addpd %xmm3, %xmm2 -; SSE2-NEXT: divpd %xmm2, %xmm1 -; SSE2-NEXT: divpd %xmm2, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE2-NEXT: addsd %xmm2, %xmm3 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] +; SSE2-NEXT: divpd %xmm3, %xmm1 +; SSE2-NEXT: divpd %xmm3, %xmm0 ; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -121,25 +121,26 @@ define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse3_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] -; SSE-NEXT: haddps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: haddps %xmm2, %xmm4 +; SSE-NEXT: haddps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse3_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse3_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ 
b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -554,6 +554,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v8i32b: @@ -670,6 +671,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v8i32b: @@ -814,6 +816,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v16i16b: @@ -954,6 +957,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v16i16b: @@ -1013,15 +1017,45 @@ } define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { -; SSE-LABEL: PR34724_2: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE_SLOW-LABEL: PR34724_2: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSE_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE_SLOW-NEXT: addps %xmm1, %xmm2 +; SSE_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE_SLOW-NEXT: retq ; -; AVX-LABEL: PR34724_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE_FAST-LABEL: PR34724_2: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddps %xmm1, %xmm0 +; SSE_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: PR34724_2: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: PR34724_2: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: PR34724_2: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: PR34724_2: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t2 = fadd <4 x float> %t0, %t1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -216,7 +216,7 @@ ; AVX-FAST-LABEL: test8_undef: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX-FAST-NEXT: retq %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 @@ -504,32 +504,17 @@ } define <4 x float> @add_ps_030(<4 x float> %x) { -; SSE-SLOW-LABEL: add_ps_030: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; 
SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; SSE-SLOW-NEXT: addps %xmm1, %xmm0 -; SSE-SLOW-NEXT: retq -; -; SSE-FAST-LABEL: add_ps_030: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm0, %xmm0 -; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,2,3] -; SSE-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: add_ps_030: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq +; SSE-LABEL: add_ps_030: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-NEXT: retq ; -; AVX-FAST-LABEL: add_ps_030: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] -; AVX-FAST-NEXT: retq +; AVX-LABEL: add_ps_030: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -584,14 +569,14 @@ ; SSE-LABEL: add_ps_016: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,1] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_016: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,1] ; AVX-NEXT: retq %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> @@ -1127,40 +1112,68 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_u123: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1 +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: movapd %xmm3, %xmm2 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm2 -; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR34724_add_v4f64_u123: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: movapd %xmm1, %xmm0 -; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 +; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_u123: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_u123: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; 
AVX-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX512-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1193,21 +1206,48 @@ ; SSE-FAST-NEXT: haddpd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_0u23: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX512-SLOW: # %bb.0: +; 
AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1239,28 +1279,42 @@ ; SSE-FAST-NEXT: movapd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> @@ -1292,22 +1346,39 @@ ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: 
PR34724_add_v4f64_012u: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_012u: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %5 = fadd <2 x double> %3, %4 diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1638,33 +1638,25 @@ ; ; SSE3-FAST-LABEL: fadd_reduce_v8f32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: haddps %xmm1, %xmm2 -; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 +; SSE3-FAST-NEXT: addps %xmm2, %xmm1 +; SSE3-FAST-NEXT: movaps %xmm1, %xmm2 +; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-FAST-NEXT: addps %xmm1, %xmm2 ; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 ; SSE3-FAST-NEXT: addss %xmm2, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: fadd_reduce_v8f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: fadd_reduce_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: fadd_reduce_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: 
vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1681,29 +1673,20 @@ ; ; SSE3-FAST-LABEL: fadd_reduce_v4f64: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2 -; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2 -; SSE3-FAST-NEXT: addsd %xmm2, %xmm0 +; SSE3-FAST-NEXT: addpd %xmm2, %xmm1 +; SSE3-FAST-NEXT: haddpd %xmm1, %xmm1 +; SSE3-FAST-NEXT: addsd %xmm1, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: fadd_reduce_v4f64: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: fadd_reduce_v4f64: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: fadd_reduce_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } @@ -1751,15 +1734,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: PR39936_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> %4 = fadd <8 x float> %2, %3 @@ -1830,22 +1804,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_8: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: hadd32_8: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: hadd32_8: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> %x227 = fadd <8 x float> %x225, %x226 %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> @@ -1880,14 +1846,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; 
AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: hadd32_16: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -1932,7 +1890,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> @@ -1951,14 +1910,6 @@ ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq -; -; AVX-LABEL: hadd32_16_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -2003,7 +1954,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> @@ -2022,14 +1974,6 @@ ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq -; -; AVX-LABEL: hadd32_16_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -2056,21 +2000,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: partial_reduction_fadd_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %x0213 = fadd <8 x float> %x, %x23 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> @@ -2100,22 +2037,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; 
AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: partial_reduction_fadd_v8f32_wrong_flags: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %x0213 = fadd fast <8 x float> %x, %x23 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> @@ -2150,13 +2079,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v16f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> %x0213 = fadd <16 x float> %x, %x23 %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -64,9 +64,10 @@ ; ; CHECK-I686-LABEL: test_bitcast_to_half: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: movw %ax, (%ecx) +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-I686-NEXT: movw %cx, (%eax) ; CHECK-I686-NEXT: retl %val_fp = bitcast i16 %in to half store half %val_fp, ptr %addr @@ -1235,7 +1236,7 @@ ; CHECK-LIBCALL-LABEL: fcopysign: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm1, %eax -; CHECK-LIBCALL-NEXT: andl $-32768, %eax # imm = 0x8000 +; CHECK-LIBCALL-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %ecx ; CHECK-LIBCALL-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-LIBCALL-NEXT: orl %eax, %ecx @@ -1245,7 +1246,7 @@ ; BWON-F16C-LABEL: fcopysign: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax -; BWON-F16C-NEXT: andl $-32768, %eax # imm = 0x8000 +; BWON-F16C-NEXT: andl $32768, %eax # imm = 0x8000 ; BWON-F16C-NEXT: vpextrw $0, %xmm0, %ecx ; BWON-F16C-NEXT: andl $32767, %ecx # imm = 0x7FFF ; BWON-F16C-NEXT: orl %eax, %ecx @@ -1254,8 +1255,8 @@ ; ; CHECK-I686-LABEL: fcopysign: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movl $-32768, %eax # imm = 0x8000 -; CHECK-I686-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; CHECK-I686-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-I686-NEXT: orl %eax, %ecx @@ -2113,8 +2114,8 @@ define void @pr63114() { ; CHECK-LIBCALL-LABEL: pr63114: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm4 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7] +; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm5 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm5[0,1,3,3,4,5,6,7] ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 @@ -2122,28 +2123,28 @@ ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] +; CHECK-LIBCALL-NEXT: por %xmm4, %xmm0 +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3,0,3] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm5 +; CHECK-LIBCALL-NEXT: por %xmm2, %xmm5 +; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm5 +; CHECK-LIBCALL-NEXT: por %xmm4, %xmm5 ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm6 ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm6 -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm6 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3,0,3] -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm4 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm4 +; CHECK-LIBCALL-NEXT: por %xmm4, %xmm6 ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm7 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm7 ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm7 -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm7 +; CHECK-LIBCALL-NEXT: por %xmm4, %xmm7 ; CHECK-LIBCALL-NEXT: movdqu %xmm7, 0 -; CHECK-LIBCALL-NEXT: movdqu %xmm4, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm6, 48 +; CHECK-LIBCALL-NEXT: movdqu %xmm5, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm0, 16 ; CHECK-LIBCALL-NEXT: retq ; @@ -2154,32 +2155,32 @@ ; BWON-F16C-NEXT: vbroadcastss (%rax), %xmm2 ; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; BWON-F16C-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] -; BWON-F16C-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; BWON-F16C-NEXT: vpsllq $48, %xmm3, %xmm4 -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] -; BWON-F16C-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; BWON-F16C-NEXT: vpor %xmm3, %xmm2, %xmm2 -; BWON-F16C-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3],xmm1[2,0] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; BWON-F16C-NEXT: vpor %xmm3, %xmm1, %xmm1 -; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,3,4,5,6,7] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] -; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm3[7] +; 
BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,3,3,4,5,6,7] +; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; BWON-F16C-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; BWON-F16C-NEXT: vpsllq $48, %xmm4, %xmm5 +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] +; BWON-F16C-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] +; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,5,5,5,5] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3],xmm6[4,5,6,7] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm4[7] +; BWON-F16C-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; BWON-F16C-NEXT: vpor %xmm4, %xmm2, %xmm2 +; BWON-F16C-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3],xmm0[4,5,6,7] +; BWON-F16C-NEXT: vpor %xmm4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; BWON-F16C-NEXT: vmovups %ymm0, 0 -; BWON-F16C-NEXT: vmovups %ymm1, 32 +; BWON-F16C-NEXT: vmovups %ymm0, 32 +; BWON-F16C-NEXT: vmovups %ymm3, 0 ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: pr63114: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movdqu (%eax), %xmm6 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7] +; CHECK-I686-NEXT: movdqu (%eax), %xmm5 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,3,4,5,6,7] ; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 @@ -2189,26 +2190,26 @@ ; CHECK-I686-NEXT: pand %xmm3, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] ; CHECK-I686-NEXT: por %xmm4, %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,7,7] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3,0,3] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; CHECK-I686-NEXT: pand %xmm1, %xmm5 ; CHECK-I686-NEXT: por %xmm2, %xmm5 ; CHECK-I686-NEXT: pand %xmm3, %xmm5 ; CHECK-I686-NEXT: por %xmm4, %xmm5 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,5,5,5,5] -; CHECK-I686-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3,0,3] -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; CHECK-I686-NEXT: pand %xmm1, %xmm6 -; CHECK-I686-NEXT: por %xmm2, %xmm6 -; CHECK-I686-NEXT: pand %xmm3, %xmm6 -; CHECK-I686-NEXT: por %xmm4, %xmm6 +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] ; CHECK-I686-NEXT: pand %xmm1, %xmm7 ; CHECK-I686-NEXT: por %xmm2, %xmm7 ; CHECK-I686-NEXT: pand %xmm3, %xmm7 ; CHECK-I686-NEXT: por %xmm4, %xmm7 -; CHECK-I686-NEXT: movdqu %xmm7, 0 -; CHECK-I686-NEXT: movdqu %xmm6, 32 -; CHECK-I686-NEXT: movdqu %xmm5, 48 +; CHECK-I686-NEXT: pand %xmm1, %xmm6 +; CHECK-I686-NEXT: por %xmm2, %xmm6 +; CHECK-I686-NEXT: pand %xmm3, %xmm6 +; CHECK-I686-NEXT: por %xmm4, %xmm6 +; CHECK-I686-NEXT: movdqu %xmm6, 0 +; CHECK-I686-NEXT: movdqu %xmm7, 48 +; CHECK-I686-NEXT: movdqu %xmm5, 32 ; CHECK-I686-NEXT: movdqu %xmm0, 16 ; CHECK-I686-NEXT: retl %1 = load <24 x half>, ptr poison, align 2 diff --git 
a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -409,16 +409,18 @@ ; X64-BMI1-LABEL: scalar_i64_lowestbit_eq: ; X64-BMI1: # %bb.0: ; X64-BMI1-NEXT: movq %rsi, %rcx +; X64-BMI1-NEXT: movl $1, %eax ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1-NEXT: shlq %cl, %rdi -; X64-BMI1-NEXT: testb $1, %dil +; X64-BMI1-NEXT: shrq %cl, %rax +; X64-BMI1-NEXT: testl %edi, %eax ; X64-BMI1-NEXT: sete %al ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: scalar_i64_lowestbit_eq: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax -; X64-BMI2-NEXT: testb $1, %al +; X64-BMI2-NEXT: movl $1, %eax +; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax +; X64-BMI2-NEXT: testl %edi, %eax ; X64-BMI2-NEXT: sete %al ; X64-BMI2-NEXT: retq %t0 = lshr i64 1, %y @@ -497,45 +499,45 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_splat_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_splat_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm2, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x @@ -581,45 +583,45 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm2, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -471,10 +471,10 @@ ; AVX2-LABEL: vec_4xi32_splat_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: 
vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: @@ -559,10 +559,10 @@ ; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -99,7 +99,7 @@ ; AVX1-FAST-LABEL: PR37890_v4f64: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -235,7 +235,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -480,8 +480,8 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -553,8 +553,8 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -565,7 +565,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -629,9 +629,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; 
X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -685,9 +685,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -697,9 +697,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -758,11 +758,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -806,11 +809,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -818,11 +824,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -910,13 +919,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorb $127, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -988,13 +1000,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorb $127, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1002,13 +1017,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1147,8 +1165,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1264,8 +1282,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; 
X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1278,7 +1296,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1361,9 +1379,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1433,9 +1451,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1447,9 +1465,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1519,11 +1537,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1575,11 +1596,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1587,13 +1611,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1700,13 +1727,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorb $127, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1794,13 +1824,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorb $127, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1808,15 +1841,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; 
X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1869,15 +1905,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1901,22 +1950,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: 
## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1956,15 +2021,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1988,22 +2066,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: 
vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2065,17 +2159,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2121,26 +2230,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorb $127, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, 
%ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2205,17 +2332,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2261,26 +2403,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorb $127, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, 
%xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -483,8 +483,8 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -557,8 +557,8 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -569,7 +569,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -633,9 +633,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminsd 
%xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -689,9 +689,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -701,9 +701,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -762,11 +762,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -810,11 +813,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -822,11 +828,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; 
X64-AVX512-NEXT: retq @@ -914,13 +923,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: addb $-128, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -992,13 +1004,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: addb $-128, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1006,13 +1021,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1151,8 +1169,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1268,8 +1286,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, 
%ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1282,7 +1300,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1365,9 +1383,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1437,9 +1455,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1451,9 +1469,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1523,11 +1541,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1579,11 +1600,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, 
%xmm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1591,13 +1615,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1704,13 +1731,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: addb $-128, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1798,13 +1828,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: addb $-128, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1812,15 +1845,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; 
X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1873,15 +1909,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1905,22 +1954,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax 
killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1960,15 +2025,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1992,22 +2070,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: 
vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2069,17 +2163,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2125,26 +2234,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: addb $-128, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; 
X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2209,17 +2336,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2265,26 +2407,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: addb $-128, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: 
test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -545,11 +545,11 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -636,10 +636,10 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; 
X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -652,7 +652,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -725,9 +725,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -790,9 +790,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -802,9 +802,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -869,12 +869,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notl %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -924,12 +926,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -937,11 +941,14 @@ ; 
X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1011,14 +1018,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notb %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1072,14 +1081,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1087,13 +1098,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; 
X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1256,10 +1270,10 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1401,10 +1415,10 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1419,7 +1433,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1517,9 +1531,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1604,9 +1618,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1618,9 +1632,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1698,12 +1712,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 
$1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notl %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1763,12 +1779,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1776,13 +1794,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1863,14 +1884,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notb %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1932,14 +1955,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; 
X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1947,15 +1972,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2012,16 +2040,29 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notl %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notl %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2062,21 +2103,26 @@ ; ; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: 
vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2120,16 +2166,29 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notl %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notl %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2170,21 +2229,26 @@ ; ; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; 
X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2231,18 +2295,33 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notb %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notb %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2288,25 +2367,30 @@ ; ; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; 
X64-AVX512-NEXT: retq @@ -2356,18 +2440,33 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notb %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notb %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2413,25 +2512,30 @@ ; ; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -489,11 
+489,11 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -582,10 +582,10 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -598,7 +598,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -671,9 +671,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -736,9 +736,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -748,9 +748,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -813,8 +813,13 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -863,8 +868,13 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -873,8 +883,13 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -939,10 +954,15 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -991,10 +1011,15 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1003,10 +1028,15 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminub 
%ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1172,10 +1202,10 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1319,10 +1349,10 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1337,7 +1367,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1435,9 +1465,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1522,9 +1552,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1536,9 +1566,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1616,8 +1646,13 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -1678,8 +1713,13 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1688,10 +1728,15 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1767,10 +1812,15 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -1827,10 +1877,15 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1839,12 +1894,17 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1902,13 +1962,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1936,13 +2009,39 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, 
%xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp ult <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1983,13 +2082,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2017,13 +2129,39 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; 
X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp ult <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2064,15 +2202,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2100,15 +2253,45 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, 
%ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp ult <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2152,15 +2335,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2188,15 +2386,45 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld 
$16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp ult <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -20,9 +20,16 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32: @@ -32,17 +39,17 @@ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] -; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: @@ -50,6 +57,25 @@ ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: 
vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %7 = fadd <2 x float> %5, %6 @@ -82,13 +108,19 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -103,15 +135,15 @@ ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -123,18 +155,18 @@ ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> @@ -173,24 +205,22 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 +; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4 -; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2 -; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6 ; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1 @@ -352,16 +382,12 @@ ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2 +; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4 -; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2 -; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6 -; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm4 +; SSSE3-FAST-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: @@ -425,8 +451,10 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -448,8 +476,10 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 ; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -524,7 +554,7 @@ ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,2] ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] @@ -550,7 +580,7 @@ ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm4[3,2] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 @@ -637,20 +667,23 @@ ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSSE3-SLOW-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -659,19 +692,22 @@ ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: 
paddd %xmm0, %xmm4 ; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 +; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSSE3-FAST-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSSE3-FAST-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] -; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3 +; SSSE3-FAST-NEXT: paddd %xmm3, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm0, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32: @@ -947,24 +983,25 @@ ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -982,27 +1019,31 @@ ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; 
SSSE3-FAST-NEXT: addps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1014,10 +1055,12 @@ ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-FAST-NEXT: retq %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0) %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1) @@ -1034,24 +1077,23 @@ ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, 
%xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: @@ -1065,69 +1107,73 @@ ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2 -; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %eax +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; 
AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %ecx +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %edx +; AVX-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vmovd %xmm2, %eax +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq ; -; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq +; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovd %xmm2, %eax +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0) %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1) %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2) diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -74,13 +74,9 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: movq %rsi, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: addq $1, %rax -; X64-NEXT: adcq $0, %rdx +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %t0 = add <1 x i128> %x, %t1 = add <1 x i128> %y, %t0 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -323,31 +323,32 @@ ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pxor %xmm1, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE-NEXT: pand %xmm4, %xmm3 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067713,18446744071562067713] +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X64-SSE-NEXT: por %xmm3, %xmm2 -; X64-SSE-NEXT: pand %xmm2, %xmm0 -; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE-NEXT: por %xmm0, %xmm2 -; X64-SSE-NEXT: pxor %xmm2, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pxor %xmm3, %xmm3 -; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm3 +; X64-SSE-NEXT: pand %xmm5, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE-NEXT: por %xmm2, %xmm3 +; X64-SSE-NEXT: pand %xmm3, %xmm0 +; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE-NEXT: por %xmm0, %xmm3 +; X64-SSE-NEXT: pxor %xmm3, %xmm1 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] -; X64-SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; X64-SSE-NEXT: pand %xmm3, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-SSE-NEXT: por %xmm1, %xmm0 -; X64-SSE-NEXT: pand %xmm0, %xmm2 -; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: por %xmm2, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; 
X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE-NEXT: por %xmm0, %xmm1 +; X64-SSE-NEXT: pand %xmm1, %xmm3 +; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE-NEXT: por %xmm3, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -39,7 +39,7 @@ ; X86-NO-CMOV: # %bb.0: ; X86-NO-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movswl %ax, %ecx -; X86-NO-CMOV-NEXT: sarl $15, %ecx +; X86-NO-CMOV-NEXT: shrl $15, %ecx ; X86-NO-CMOV-NEXT: xorl %ecx, %eax ; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -985,6 +985,18 @@ ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm1 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x8_sext: @@ -993,6 +1005,21 @@ ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: pextrb $3, %xmm0, %edx +; SSE41-NEXT: pinsrb $1, %eax, %xmm0 +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: negb %cl +; SSE41-NEXT: movzbl %cl, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm0 +; SSE41-NEXT: andb $1, %dl +; SSE41-NEXT: negb %dl +; SSE41-NEXT: movzbl %dl, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: eq_or_to_abs_vec4x8_sext: @@ -1001,6 +1028,22 @@ ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: shll $8, %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: orl %ecx, %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: shll $24, %ecx +; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp eq <4 x i8> %x, %cmp2 = icmp eq <4 x i8> %x, @@ -1089,6 +1132,18 @@ ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; 
AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1099,6 +1154,18 @@ ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm0, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %edx +; SSE41-NEXT: pinsrw $1, %eax, %xmm0 +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE41-NEXT: andl $1, %edx +; SSE41-NEXT: negl %edx +; SSE41-NEXT: pinsrw $3, %edx, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1109,6 +1176,18 @@ ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %edx +; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: pinsrw $2, %ecx, %xmm0 +; SSE2-NEXT: andl $1, %edx +; SSE2-NEXT: negl %edx +; SSE2-NEXT: pinsrw $3, %edx, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i16> %x, %cmp2 = icmp ne <4 x i16> %x, diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -163,7 +163,7 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll --- a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll @@ -198,7 +198,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movswl %ax, %ecx -; X86-NEXT: sarl $15, %ecx +; X86-NEXT: shrl $15, %ecx ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movzwl %ax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -13,30 +13,34 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %edi +; X86-NEXT: addl $1, %ecx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %ebx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: shldl $4, %edx, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shldl $4, %esi, %ebp +; 
X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrl $28, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -52,9 +56,11 @@ ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: addq $1, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $60, %rcx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: shldq $4, %rax, %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: shrq $60, %rsi +; X64-NEXT: orq %rcx, %rsi ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: retq @@ -73,21 +79,27 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl $15, %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shrl $17, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: sete %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shrq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shrdq $17, %rsi, %rdi +; X64-NEXT: shrq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -98,21 +110,27 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl $15, %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shrl $17, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shrq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shrdq $17, %rsi, %rdi +; X64-NEXT: shrq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: setne %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -123,19 +141,27 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: shldl $17, %eax, %esi +; X86-NEXT: shll $17, %edx +; X86-NEXT: orl 
%esi, %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: sete %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shldq $17, %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -146,19 +172,27 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: shldl $17, %eax, %esi +; X86-NEXT: shll $17, %edx +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shldq $17, %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: setne %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -233,8 +267,9 @@ ; ; X64-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shldq $17, %rsi, %rdi +; X64-NEXT: shlq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl.a = shl i64 %a, 17 diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -108,7 +108,17 @@ ; ; X64-LABEL: i56_or: ; X64: # %bb.0: -; X64-NEXT: orl $384, (%rdi) # imm = 0x180 +; X64-NEXT: movzwl 4(%rdi), %eax +; X64-NEXT: shlq $32, %rax +; X64-NEXT: movzbl 6(%rdi), %ecx +; X64-NEXT: shlq $48, %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: orq %rcx, %rax +; X64-NEXT: orq $384, %rax # imm = 0x180 +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movw %cx, 4(%rdi) +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 @@ -129,18 +139,17 @@ ; X64-LABEL: i56_and_or: ; X64: # %bb.0: ; X64-NEXT: movzwl 4(%rdi), %eax +; X64-NEXT: shlq $32, %rax ; X64-NEXT: movzbl 6(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: shlq $32, %rcx +; X64-NEXT: shlq $48, %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: orq %rcx, %rax ; X64-NEXT: orq $384, %rax # imm = 0x180 -; X64-NEXT: movabsq $72057594037927808, %rcx # imm = 0xFFFFFFFFFFFF80 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: movl %ecx, (%rdi) -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: movw %cx, 4(%rdi) +; X64-NEXT: andq $-128, %rax +; X64-NEXT: movl %eax, (%rdi) +; X64-NEXT: shrq $32, %rax +; X64-NEXT: movw %ax, 4(%rdi) ; X64-NEXT: retq %b = load i56, ptr %a, align 1 %c = and i56 %b, -128 @@ -163,19 +172,20 @@ ; ; X64-LABEL: i56_insert_bit: ; X64: # %bb.0: -; X64-NEXT: movzwl 4(%rdi), %eax -; X64-NEXT: movzbl 6(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movzwl 4(%rdi), %ecx ; X64-NEXT: shlq $32, 
%rcx -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: shll $13, %esi -; X64-NEXT: andq $-8193, %rax # imm = 0xDFFF -; X64-NEXT: orl %eax, %esi -; X64-NEXT: shrq $32, %rax -; X64-NEXT: movw %ax, 4(%rdi) -; X64-NEXT: movl %esi, (%rdi) +; X64-NEXT: movzbl 6(%rdi), %edx +; X64-NEXT: shlq $48, %rdx +; X64-NEXT: orq %rcx, %rdx +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: shlq $13, %rax +; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: shrq $32, %rdx +; X64-NEXT: movw %dx, 4(%rdi) +; X64-NEXT: movl %ecx, (%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, ptr %a, align 1 diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll --- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll +++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll @@ -31,18 +31,18 @@ ; AVX-32: # %bb.0: # %L.entry ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0 -; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-32-NEXT: vmovups %ymm0, 608(%eax) ; AVX-32-NEXT: vzeroupper ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: PR15298: ; AVX-64: # %bb.0: # %L.entry -; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 -; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) ; AVX-64-NEXT: vzeroupper ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -2270,14 +2270,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: movl (%rdi), %eax ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 112(%rdi) -; SSE-NEXT: movdqa %xmm0, 64(%rdi) +; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 80(%rdi) -; SSE-NEXT: movdqa %xmm0, 32(%rdi) +; SSE-NEXT: movdqa %xmm0, 64(%rdi) ; SSE-NEXT: movdqa %xmm0, 48(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm0, 32(%rdi) ; SSE-NEXT: movdqa %xmm0, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: leal 2147483647(%rax), %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: cmovnsl %eax, %ecx @@ -2293,8 +2293,8 @@ ; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX1-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) ; AVX1-NEXT: movl (%rdi), %eax ; AVX1-NEXT: vmovaps %ymm1, (%rdi) @@ -2314,8 +2314,8 @@ ; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 ; AVX2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) ; AVX2-NEXT: movl (%rdi), %eax ; AVX2-NEXT: vmovdqa 
%ymm1, (%rdi) @@ -2357,8 +2357,8 @@ ; X86AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 ; X86AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; X86AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 96(%ecx) +; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 32(%ecx) ; X86AVX2-NEXT: movl (%ecx), %eax ; X86AVX2-NEXT: vmovaps %ymm1, (%ecx) diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -337,19 +337,22 @@ ; SSE2-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE2: # %bb.0: ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: pinsrw $7, %eax, %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE3: # %bb.0: ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: xorl %eax, %eax +; SSE3-NEXT: pinsrw $7, %eax, %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSSE3: # %bb.0: ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i16_z12345z789ABCDEz: @@ -359,10 +362,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq %1 = insertelement <16 x i16> %a, i16 0, i32 0 %2 = insertelement <16 x i16> %1, i16 0, i32 6 %3 = insertelement <16 x i16> %2, i16 0, i32 15 diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -246,7 +246,7 @@ define i1 @is_inf_f80(x86_fp80 %x) { ; CHECK-32-LABEL: is_inf_f80: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: notl %eax ; CHECK-32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; CHECK-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx @@ -258,7 +258,7 @@ ; ; CHECK-64-LABEL: is_inf_f80: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; CHECK-64-NEXT: notl %eax ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx @@ -301,9 +301,9 @@ ; CHECK-32-LABEL: is_neginf_f80: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: xorl $65535, %eax # imm = 0xFFFF ; CHECK-32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; CHECK-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: xorl $65535, %eax # imm = 0xFFFF ; CHECK-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: orl %ecx, %eax ; CHECK-32-NEXT: sete %al @@ -312,10 +312,10 @@ ; CHECK-64-LABEL: is_neginf_f80: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; 
CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; CHECK-64-NEXT: orq %rax, %rcx +; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF +; CHECK-64-NEXT: orq %rcx, %rax ; CHECK-64-NEXT: sete %al ; CHECK-64-NEXT: retq entry: @@ -363,22 +363,22 @@ ; CHECK-32-NEXT: pushl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: movswl %dx, %ecx -; CHECK-32-NEXT: sarl $15, %ecx +; CHECK-32-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: movl %ecx, %edx +; CHECK-32-NEXT: sarl $31, %edx ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF -; CHECK-32-NEXT: decl %edx -; CHECK-32-NEXT: movzwl %dx, %edx +; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: decl %ecx +; CHECK-32-NEXT: movzwl %cx, %ecx ; CHECK-32-NEXT: xorl %esi, %esi -; CHECK-32-NEXT: cmpl $32766, %edx # imm = 0x7FFE +; CHECK-32-NEXT: cmpl $32766, %ecx # imm = 0x7FFE ; CHECK-32-NEXT: sbbl %esi, %esi -; CHECK-32-NEXT: setb %dl -; CHECK-32-NEXT: testl %ecx, %ecx -; CHECK-32-NEXT: setns %cl +; CHECK-32-NEXT: setb %cl +; CHECK-32-NEXT: testl %edx, %edx +; CHECK-32-NEXT: setns %dl ; CHECK-32-NEXT: shrl $31, %eax -; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: andb %dl, %al +; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: # kill: def $al killed $al killed $eax ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 @@ -434,9 +434,10 @@ ; ; CHECK-64-LABEL: is_negnormal_f80: ; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; CHECK-64-NEXT: movswq %cx, %rdx ; CHECK-64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-64-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; CHECK-64-NEXT: testq %rcx, %rcx +; CHECK-64-NEXT: testq %rdx, %rdx ; CHECK-64-NEXT: sets %dl ; CHECK-64-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-64-NEXT: decl %ecx diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -792,7 +792,8 @@ define i1 @issubnormal_or_zero_f(float %x) { ; CHECK-32-LABEL: issubnormal_or_zero_f: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: testl $32640, %eax # imm = 0x7F80 ; CHECK-32-NEXT: sete %al ; CHECK-32-NEXT: retl ; @@ -810,7 +811,8 @@ define i1 @issubnormal_or_zero_f_daz(float %x) #0 { ; CHECK-32-LABEL: issubnormal_or_zero_f_daz: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: testl $32640, %eax # imm = 0x7F80 ; CHECK-32-NEXT: sete %al ; CHECK-32-NEXT: retl ; @@ -828,7 +830,8 @@ define i1 @issubnormal_or_zero_f_maybe_daz(float %x) #1 { ; CHECK-32-LABEL: issubnormal_or_zero_f_maybe_daz: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: testl $32640, %eax # imm = 0x7F80 ; CHECK-32-NEXT: sete %al ; CHECK-32-NEXT: retl ; @@ -846,7 +849,8 @@ define i1 @not_issubnormal_or_zero_f(float %x) { ; CHECK-32-LABEL: not_issubnormal_or_zero_f: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: testl $32640, %eax # imm = 
0x7F80 ; CHECK-32-NEXT: setne %al ; CHECK-32-NEXT: retl ; @@ -864,7 +868,8 @@ define i1 @not_issubnormal_or_zero_f_daz(float %x) #0 { ; CHECK-32-LABEL: not_issubnormal_or_zero_f_daz: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: testl $32640, %eax # imm = 0x7F80 ; CHECK-32-NEXT: setne %al ; CHECK-32-NEXT: retl ; @@ -882,7 +887,8 @@ define i1 @not_issubnormal_or_zero_f_maybe_daz(float %x) #1 { ; CHECK-32-LABEL: not_issubnormal_or_zero_f_maybe_daz: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: testl $32640, %eax # imm = 0x7F80 ; CHECK-32-NEXT: setne %al ; CHECK-32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -78,20 +78,20 @@ ; CHECK-NOBMI-LABEL: is_pow2_non_zero_4xv64: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm2 = [256,256] -; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: por %xmm2, %xmm1 +; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: paddq %xmm1, %xmm2 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm2 ; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 -; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2 -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm2 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; CHECK-NOBMI-NEXT: andps %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: retq ; @@ -128,17 +128,17 @@ ; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm2 = [256,256] -; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: por %xmm2, %xmm1 +; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 +; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] @@ -186,26 +186,23 @@ ; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm4 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm1 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm5 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm5 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm5 +; CHECK-NOBMI-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm1[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; CHECK-NOBMI-NEXT: 
andps %xmm6, %xmm5 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm1 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1 -; CHECK-NOBMI-NEXT: por %xmm5, %xmm1 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm5 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4 -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm1 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0 -; CHECK-NOBMI-NEXT: por %xmm5, %xmm0 -; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NOBMI-NEXT: andps %xmm1, %xmm0 +; CHECK-NOBMI-NEXT: xorps %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: orps %xmm5, %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z: diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -228,11 +228,13 @@ ; CHECK-NEXT: jne .LBB12_8 ; CHECK-NEXT: # %bb.4: # %if.end29 ; CHECK-NEXT: movzwl (%eax), %eax -; CHECK-NEXT: imull $-13107, %eax, %eax # imm = 0xCCCD -; CHECK-NEXT: rorw %ax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: cmpl $6554, %eax # imm = 0x199A -; CHECK-NEXT: jae .LBB12_5 +; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD +; CHECK-NEXT: shrl $18, %ecx +; CHECK-NEXT: andl $-2, %ecx +; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: jne .LBB12_5 ; CHECK-NEXT: .LBB12_8: # %if.then44 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al @@ -388,11 +390,10 @@ ; CHECK-LABEL: func_test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl b, %eax -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: setb %cl ; CHECK-NEXT: movl a, %eax -; CHECK-NEXT: testl %eax, %ecx +; CHECK-NEXT: testb %al, %cl ; CHECK-NEXT: je .LBB18_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: decl %eax diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -349,12 +349,26 @@ define <4 x i32> @knownbits_mask_srem_shuffle_lshr(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_srem_shuffle_lshr: ; X86: # %bb.0: -; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vpsrld $28, %xmm0, %xmm0 +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3] +; X86-NEXT: vpsrld $22, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_srem_shuffle_lshr: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vpsrld $28, %xmm0, %xmm0 +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3] +; X64-NEXT: vpsrld $22, %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = srem <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll --- a/llvm/test/CodeGen/X86/known-never-zero.ll +++ b/llvm/test/CodeGen/X86/known-never-zero.ll @@ -870,11 +870,10 @@ define i32 @zext_maybe_zero(i16 %x) { ; CHECK-LABEL: zext_maybe_zero: ; CHECK: # %bb.0: -; CHECK-NEXT: testw %di, %di +; CHECK-NEXT: andl $65535, %edi # imm = 0xFFFF ; CHECK-NEXT: je .LBB49_1 ; CHECK-NEXT: # %bb.2: # %cond.false -; CHECK-NEXT: movzwl %di, %eax -; CHECK-NEXT: rep bsfl %eax, %eax +; CHECK-NEXT: rep bsfl %edi, %eax ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB49_1: ; CHECK-NEXT: movl $32, %eax diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll --- a/llvm/test/CodeGen/X86/known-pow2.ll +++ b/llvm/test/CodeGen/X86/known-pow2.ll @@ -26,16 +26,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [954437177,1073741824,268435456,67108864] ; CHECK-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-NEXT: psrld $1, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[3,3] +; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: psrld $1, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -157,8 +157,9 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_0: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -179,8 +180,9 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_1: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -203,10 +205,10 @@ ; ; X64-LABEL: signbits_ashr_shl_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vpsllq $20, %xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: sarq $61, %rax +; X64-NEXT: shll $20, %eax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shl <2 x i64> %1, @@ -220,9 +222,9 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrdl $30, %ecx, %eax ; X86-NEXT: sarl $30, %ecx -; X86-NEXT: shll $2, %eax ; X86-NEXT: vmovd %eax, %xmm0 ; X86-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X86-NEXT: vpsrlq $3, %xmm0, %xmm0 @@ -235,9 +237,8 @@ ; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp: ; X64: # %bb.0: ; X64-NEXT: sarq $30, %rdi -; X64-NEXT: vmovq %rdi, %xmm0 -; X64-NEXT: vpsrlq $3, %xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: shrq $3, %rdi +; X64-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr i64 %a0, 30 %2 = insertelement <2 x i64> undef, i64 %1, i32 0 @@ -352,7 +353,8 @@ ; X64: # %bb.0: ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vmovd %edi, %xmm1 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: vmovq %rax, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -405,24 +407,24 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovapd 8(%ebp), %xmm3 -; X86-NEXT: vpsrad $31, %xmm2, %xmm4 -; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; X86-NEXT: vpsrad $1, %xmm5, %xmm5 -; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-NEXT: vpmovsxdq 8(%ebp), %xmm4 +; X86-NEXT: vpmovsxdq 16(%ebp), %xmm3 ; X86-NEXT: vpsrad $31, %xmm2, %xmm5 +; X86-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; X86-NEXT: vpsrad $1, %xmm6, %xmm6 +; X86-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] +; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-NEXT: vpsrad $31, %xmm2, %xmm6 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-NEXT: vpsrad $1, %xmm2, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 +; X86-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4 ; X86-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm5, %xmm0 -; X86-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; X86-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] diff --git a/llvm/test/CodeGen/X86/lea-recursion.ll b/llvm/test/CodeGen/X86/lea-recursion.ll --- a/llvm/test/CodeGen/X86/lea-recursion.ll +++ b/llvm/test/CodeGen/X86/lea-recursion.ll @@ -21,27 +21,27 @@ ; CHECK-NEXT: leal 1(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+4(%rip) ; CHECK-NEXT: movl g1+4(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx +; CHECK-NEXT: leal (%rdx,%rax), %ecx ; CHECK-NEXT: leal 2(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+8(%rip) ; CHECK-NEXT: movl g1+8(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rcx), %edx -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal (%rcx,%rax), %edx +; CHECK-NEXT: leal 3(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+12(%rip) ; CHECK-NEXT: movl g1+12(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx -; CHECK-NEXT: leal 2(%rax,%rdx), %eax +; CHECK-NEXT: leal (%rdx,%rax), %ecx +; CHECK-NEXT: leal 4(%rax,%rdx), %eax ; CHECK-NEXT: movl 
%eax, g0+16(%rip) ; CHECK-NEXT: movl g1+16(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rcx), %edx -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal (%rcx,%rax), %edx +; CHECK-NEXT: leal 5(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+20(%rip) ; CHECK-NEXT: movl g1+20(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx -; CHECK-NEXT: leal 2(%rax,%rdx), %eax +; CHECK-NEXT: leal (%rdx,%rax), %ecx +; CHECK-NEXT: leal 6(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+24(%rip) ; CHECK-NEXT: movl g1+24(%rip), %eax -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal 7(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+28(%rip) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/legalize-shift.ll b/llvm/test/CodeGen/X86/legalize-shift.ll --- a/llvm/test/CodeGen/X86/legalize-shift.ll +++ b/llvm/test/CodeGen/X86/legalize-shift.ll @@ -5,13 +5,17 @@ define void @PR36250() nounwind { ; X86-LABEL: PR36250: ; X86: # %bb.0: -; X86-NEXT: cmpl $0, (%eax) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete (%eax) ; X86-NEXT: retl ; ; X64-LABEL: PR36250: ; X64: # %bb.0: -; X64-NEXT: cmpq $0, (%rax) +; X64-NEXT: movq (%rax), %rax +; X64-NEXT: leaq (%rax,%rax), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete (%rax) ; X64-NEXT: retq %1 = load i448, ptr undef diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll --- a/llvm/test/CodeGen/X86/lifetime-alias.ll +++ b/llvm/test/CodeGen/X86/lifetime-alias.ll @@ -28,10 +28,10 @@ ; CHECK: # %bb.0: # %_ZNSt3__312basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED2Ev.exit50 ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $7016996765293437281, %rax # imm = 0x6161616161616161 ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $5632, {{[0-9]+}}(%rsp) # imm = 0x1600 ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -44,13 +44,11 @@ ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $21, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movups .L.str.1(%rip), %xmm1 ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax @@ -61,6 +59,8 @@ ; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/load-chain.ll b/llvm/test/CodeGen/X86/load-chain.ll --- a/llvm/test/CodeGen/X86/load-chain.ll +++ b/llvm/test/CodeGen/X86/load-chain.ll @@ -11,9 +11,9 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq 
%rax ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movl $-32707, %ebp # imm = 0x803D -; CHECK-NEXT: andl (%rdi), %ebp +; CHECK-NEXT: movzwl (%rdi), %ebp ; CHECK-NEXT: callq maybe_mutate@PLT +; CHECK-NEXT: andl $32829, %ebp # imm = 0x803D ; CHECK-NEXT: orl $514, %ebp # imm = 0x202 ; CHECK-NEXT: movw %bp, (%rbx) ; CHECK-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -894,7 +894,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_base_offset_index: @@ -939,13 +939,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 13(%eax,%ecx), %eax +; CHECK-NEXT: movl 13(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movl %esi, %eax -; CHECK64-NEXT: movl 13(%rax,%rdi), %eax +; CHECK64-NEXT: movl 13(%rdi,%rax), %eax ; CHECK64-NEXT: retq %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 @@ -995,7 +995,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_zaext_loads: @@ -1051,7 +1051,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_zsext_loads: diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll --- a/llvm/test/CodeGen/X86/load-local-v3i1.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll @@ -19,10 +19,10 @@ ; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: addb %dl, %dl ; CHECK-NEXT: orb %sil, %dl -; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: andb $7, %cl +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: # implicit-def: $xmm0 ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %else @@ -56,10 +56,10 @@ ; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: addb %dl, %dl ; CHECK-NEXT: orb %sil, %dl -; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: andb $7, %cl +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %else ; CHECK-NEXT: testb $2, %cl diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll --- a/llvm/test/CodeGen/X86/load-local-v3i129.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll @@ -5,28 +5,35 @@ define void @_start() nounwind { ; FAST-SHLD-LABEL: _start: ; FAST-SHLD: # %bb.0: # %Entry -; FAST-SHLD-NEXT: movq -40(%rsp), %rax -; FAST-SHLD-NEXT: movq -32(%rsp), %rcx -; FAST-SHLD-NEXT: movq %rcx, %rdx -; FAST-SHLD-NEXT: shlq $62, %rdx -; FAST-SHLD-NEXT: shrq $2, %rcx -; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx -; FAST-SHLD-NEXT: andq $-4, %rax -; FAST-SHLD-NEXT: orq $1, %rax -; FAST-SHLD-NEXT: movq %rax, -40(%rsp) -; FAST-SHLD-NEXT: movq %rcx, -32(%rsp) -; FAST-SHLD-NEXT: orq $-2, -56(%rsp) +; FAST-SHLD-NEXT: movl -24(%rsp), %eax +; FAST-SHLD-NEXT: movl %eax, %ecx +; FAST-SHLD-NEXT: shrl 
$2, %ecx +; FAST-SHLD-NEXT: movq -40(%rsp), %rdx +; FAST-SHLD-NEXT: movq -32(%rsp), %rsi +; FAST-SHLD-NEXT: shldq $62, %rsi, %rax +; FAST-SHLD-NEXT: shrdq $2, %rsi, %rdx +; FAST-SHLD-NEXT: leaq 1(,%rdx,4), %rsi +; FAST-SHLD-NEXT: movq %rsi, -40(%rsp) +; FAST-SHLD-NEXT: shrdq $62, %rax, %rdx +; FAST-SHLD-NEXT: movq %rdx, -32(%rsp) +; FAST-SHLD-NEXT: shrdq $62, %rcx, %rax +; FAST-SHLD-NEXT: andl $7, %eax +; FAST-SHLD-NEXT: movb %al, -24(%rsp) ; FAST-SHLD-NEXT: movq $-1, -48(%rsp) +; FAST-SHLD-NEXT: orq $-2, -56(%rsp) ; FAST-SHLD-NEXT: retq ; ; SLOW-SHLD-LABEL: _start: ; SLOW-SHLD: # %bb.0: # %Entry ; SLOW-SHLD-NEXT: movq -40(%rsp), %rax +; SLOW-SHLD-NEXT: movzbl -24(%rsp), %ecx +; SLOW-SHLD-NEXT: andl $7, %ecx +; SLOW-SHLD-NEXT: movb %cl, -24(%rsp) ; SLOW-SHLD-NEXT: andq $-4, %rax ; SLOW-SHLD-NEXT: orq $1, %rax ; SLOW-SHLD-NEXT: movq %rax, -40(%rsp) -; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: movq $-1, -48(%rsp) +; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: retq Entry: %y = alloca <3 x i129>, align 16 diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -104,14 +104,32 @@ } define <4 x float> @load_float4_float3_as_float2_float(ptr nocapture readonly dereferenceable(16)) nofree nosync { -; SSE-LABEL: load_float4_float3_as_float2_float: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3_as_float2_float: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3_as_float2_float: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3_as_float2_float: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_as_float2_float: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %2 = load <2 x float>, ptr %0, align 4 %3 = extractelement <2 x float> %2, i32 0 @@ -380,40 +398,48 @@ } define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) { -; SSE-LABEL: PR43227: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, 672(%rsi) -; SSE-NEXT: movdqa %xmm0, 688(%rsi) -; SSE-NEXT: retq +; SSE2-LABEL: PR43227: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, 672(%rsi) +; SSE2-NEXT: movdqa %xmm0, 688(%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR43227: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero +; SSSE3-NEXT: psrlq $32, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, 672(%rsi) +; SSSE3-NEXT: movdqa %xmm0, 688(%rsi) +; SSSE3-NEXT: retq ; -; AVX1-LABEL: PR43227: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; SSE41-LABEL: PR43227: +; SSE41: # %bb.0: +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, 672(%rsi) +; SSE41-NEXT: movaps %xmm1, 688(%rsi) +; SSE41-NEXT: retq ; -; AVX2-LABEL: PR43227: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: PR43227: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = getelementptr i32, ptr %explicit_0, i64 63 %2 = load <3 x i32>, ptr %1, align 1 %3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32> @@ -423,3 +449,6 @@ store <8 x i32> %5, ptr %6, align 32 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX1: {{.*}} +; AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -365,7 +365,7 @@ ; SSE-NEXT: movzwl %cx, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: shrl $15, %eax -; SSE-NEXT: sarl $5, %ecx +; SSE-NEXT: shrl $5, %ecx ; SSE-NEXT: addl %eax, %ecx ; SSE-NEXT: movd %ecx, %xmm0 ; SSE-NEXT: retq @@ -379,7 +379,7 @@ ; AVX-NEXT: movzwl %cx, %eax ; AVX-NEXT: movswl %ax, %ecx ; AVX-NEXT: shrl $15, %eax -; AVX-NEXT: sarl $5, %ecx +; AVX-NEXT: shrl $5, %ecx ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vmovd %ecx, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -30,7 +30,9 @@ ; ; X64-LZCNT-LABEL: lshr_ctlz_cmpeq_one_i64: ; X64-LZCNT: # %bb.0: -; X64-LZCNT-NEXT: testq %rdi, %rdi +; X64-LZCNT-NEXT: lzcntq %rdi, %rax +; X64-LZCNT-NEXT: shrl $6, %eax +; X64-LZCNT-NEXT: cmpl $1, %eax ; X64-LZCNT-NEXT: sete %al ; X64-LZCNT-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) @@ -111,8 +113,9 @@ ; ; X64-LZCNT-LABEL: lshr_ctlz_cmpne_zero_i64: ; X64-LZCNT: # %bb.0: -; X64-LZCNT-NEXT: testq %rdi, %rdi -; X64-LZCNT-NEXT: sete %al +; X64-LZCNT-NEXT: lzcntq %rdi, %rax +; X64-LZCNT-NEXT: testb $64, %al +; X64-LZCNT-NEXT: setne %al ; X64-LZCNT-NEXT: retq %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) %lshr = lshr i64 %ctlz, 6 diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -40,9 +40,9 @@ ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB0_1: # %vector.body ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 +; AVX-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm2 +; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: addq $8, %rcx ; AVX-NEXT: cmpq %rcx, %rax @@ -96,7 +96,16 @@ ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2 ; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -118,8 +127,13 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB1_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm3 
+; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rcx @@ -144,17 +158,20 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB1_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 -; AVX256-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm2 +; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $8, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB1_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -203,14 +220,32 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm3, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pmulhw %xmm4, %xmm7 +; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pmulhw %xmm5, %xmm7 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB2_1 @@ -234,14 +269,24 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 -; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwd 24(%rdi,%rcx,2), %xmm2 +; AVX1-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: 
vpmovsxwd 8(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxwd 24(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB2_1 @@ -268,8 +313,15 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %ymm2 +; AVX2-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %ymm3 +; AVX2-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax @@ -277,9 +329,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -294,8 +346,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB2_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %ymm1 -; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm1 +; AVX512-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax @@ -304,9 +360,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -350,7 +406,6 @@ ; SSE2-LABEL: _Z10test_shortPsS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -359,26 +414,63 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body ; 
SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 -; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9 -; SSE2-NEXT: pmaddwd %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm7, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm8, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm7 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm10 +; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm11 +; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm12 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm8 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm13 +; SSE2-NEXT: pmulhw %xmm7, %xmm13 +; SSE2-NEXT: pmullw %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm6, %xmm13 +; SSE2-NEXT: pmulhw %xmm10, %xmm13 +; SSE2-NEXT: pmullw %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm8, %xmm13 +; SSE2-NEXT: pmulhw %xmm11, %xmm13 +; SSE2-NEXT: pmullw %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: pmulhw %xmm12, %xmm13 +; SSE2-NEXT: pmullw %xmm12, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm12[1,3] +; SSE2-NEXT: paddd %xmm13, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm11[1,3] +; SSE2-NEXT: paddd %xmm14, %xmm8 +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm10[1,3] +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; 
SSE2-NEXT: paddd %xmm4, %xmm3 @@ -403,22 +495,42 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB3_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm3 -; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 -; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 -; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 -; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 -; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxwd 56(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmovsxwd 48(%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxwd 40(%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovsxwd 24(%rdi,%rcx,2), %xmm5 +; AVX1-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %xmm6 +; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm6 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm7 +; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovsxwd 56(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 48(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpmovsxwd 40(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmovsxwd 24(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB3_1 @@ -451,11 +563,25 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB3_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 -; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxwd 48(%rdi,%rcx,2), %ymm3 +; AVX2-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %ymm4 +; AVX2-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm5 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovsxwd 48(%rsi,%rcx,2), %ymm5 +; AVX2-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %ymm6 +; AVX2-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %ymm6 +; AVX2-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm7 +; AVX2-NEXT: vpackssdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: 
vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax @@ -465,9 +591,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -483,10 +609,16 @@ ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB3_1: # %vector.body ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 -; AVX512F-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm3 -; AVX512F-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3 -; AVX512F-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm2 +; AVX512F-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %zmm3 +; AVX512F-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm4 +; AVX512F-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %zmm5 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm5, %ymm5 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: addq $16, %rcx @@ -497,9 +629,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -515,8 +647,17 @@ ; AVX512BW-NEXT: .p2align 4, 0x90 ; AVX512BW-NEXT: .LBB3_1: # %vector.body ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqu64 (%rsi,%rcx,2), %zmm2 -; AVX512BW-NEXT: vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %zmm2 +; AVX512BW-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm3 +; AVX512BW-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %zmm4 +; AVX512BW-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm5 +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm5, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $16, %rcx ; AVX512BW-NEXT: cmpq %rcx, %rax @@ -526,9 +667,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -668,7 +809,15 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -690,9 +839,13 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm1 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $16, %rcx @@ -717,18 +870,20 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB5_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 -; AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX256-NEXT: vpmovsxbd (%rdi,%rcx), %ymm1 +; AVX256-NEXT: vpmovsxbd (%rsi,%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $16, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB5_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -783,14 +938,30 @@ ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: pmaddwd %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 @@ -814,11 +985,19 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 @@ -850,8 +1029,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB6_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm2 +; AVX2-NEXT: vpmovsxbd (%rdi,%rcx), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd (%rsi,%rcx), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] ; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx @@ -860,9 +1045,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -877,9 +1062,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB6_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1 -; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2 -; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpmovsxbd (%rdi,%rcx), %zmm1 +; AVX512-NEXT: vpmovsxbd (%rsi,%rcx), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax @@ -888,9 
+1076,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -934,7 +1122,6 @@ ; SSE2-LABEL: _Z9test_charPcS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -944,37 +1131,70 @@ ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6 +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 ; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; SSE2-NEXT: psraw $8, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE2-NEXT: psraw $8, %xmm10 -; SSE2-NEXT: pmaddwd %xmm9, %xmm10 -; SSE2-NEXT: paddd %xmm10, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: pmullw %xmm5, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; 
SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: pmullw %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE2-NEXT: psraw $8, %xmm13 +; SSE2-NEXT: pmullw %xmm11, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm9 +; SSE2-NEXT: pmullw %xmm10, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm7, %xmm15 +; SSE2-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm8[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm10[1,3] +; SSE2-NEXT: paddd %xmm13, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm12[1,3] +; SSE2-NEXT: paddd %xmm14, %xmm11 +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm8[1,3] +; SSE2-NEXT: paddd %xmm15, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 @@ -999,17 +1219,33 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB7_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 -; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 28(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbd 24(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 20(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 16(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rcx), %xmm6 +; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm6 +; AVX1-NEXT: 
vpmovsxbd (%rdi,%rcx), %xmm7 +; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovsxbd 28(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 24(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 20(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 16(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 @@ -1051,14 +1287,26 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB7_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpmovsxbd 24(%rdi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd 16(%rdi,%rcx), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm4 +; AVX2-NEXT: vpmovsxbd (%rdi,%rcx), %ymm5 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovsxbd 24(%rsi,%rcx), %ymm5 +; AVX2-NEXT: vpmovsxbd 16(%rsi,%rcx), %ymm6 +; AVX2-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm6 +; AVX2-NEXT: vpmovsxbd (%rsi,%rcx), %ymm7 +; AVX2-NEXT: vpackssdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $32, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB7_1 @@ -1067,9 +1315,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1101,9 +1349,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -1119,8 +1367,12 @@ ; AVX512BW-NEXT: .p2align 4, 0x90 ; AVX512BW-NEXT: .LBB7_1: # %vector.body ; AVX512BW-NEXT: # =>This Inner Loop 
Header: Depth=1 -; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; AVX512BW-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; AVX512BW-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $32, %rcx @@ -1131,9 +1383,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -1335,9 +1587,9 @@ ; AVX256-NEXT: jne .LBB9_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -1490,9 +1742,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1518,9 +1770,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1743,9 +1995,9 @@ ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1777,9 +2029,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1824,13 +2076,39 @@ 
define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: pmaddwd_8: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_8: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_8: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %b = sext <8 x i16> %B to <8 x i32> %m = mul nsw <8 x i32> %a, %b @@ -1843,13 +2121,39 @@ define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: pmaddwd_8_swapped: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_8_swapped: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_8_swapped: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_8_swapped: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 
+; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %b = sext <8 x i16> %B to <8 x i32> %m = mul nsw <8 x i32> %a, %b @@ -1877,13 +2181,24 @@ ; ; AVX1-LABEL: larger_mul: ; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: larger_mul: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1892,8 +2207,10 @@ ; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = sext <16 x i16> %A to <16 x i32> @@ -1908,8 +2225,26 @@ define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: pmaddwd_16: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm2, %xmm0 -; SSE2-NEXT: pmaddwd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_16: @@ -1921,10 +2256,20 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddwd_16: -; AVX256: # %bb.0: -; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddwd_16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddwd_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 
= ymm0[0,2,1,3] +; AVX512-NEXT: retq %a = sext <16 x i16> %A to <16 x i32> %b = sext <16 x i16> %B to <16 x i32> %m = mul nsw <16 x i32> %a, %b @@ -1937,10 +2282,46 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { ; SSE2-LABEL: pmaddwd_32: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm4, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pmulhw %xmm4, %xmm8 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pmulhw %xmm5, %xmm8 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm7[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm6[0,2] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm7[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm6[1,3] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm11, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_32: @@ -1988,13 +2369,36 @@ define <4 x i32> @pmaddwd_const(<8 x i16> %A) { ; SSE2-LABEL: pmaddwd_const: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_const: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_const: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX256-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %m = mul nsw <8 x i32> %a, %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> @@ -2058,9 +2462,9 @@ ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] ; SSE2-NEXT: paddd %xmm2, %xmm1 @@ -2095,13 +2499,41 @@ define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: jumbled_indices4: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: jumbled_indices4: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: jumbled_indices4: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX1-NEXT: retq +; +; AVX256-LABEL: jumbled_indices4: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %exta = sext <8 x i16> %A to <8 x i32> %extb = sext <8 x i16> %B to <8 x i32> %m = mul <8 x i32> %exta, %extb @@ -2114,8 +2546,26 @@ define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: jumbled_indices8: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm2, %xmm0 -; SSE2-NEXT: pmaddwd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, 
%xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm3[2,1] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[2,1] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: jumbled_indices8: @@ -2127,10 +2577,21 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: jumbled_indices8: -; AVX256: # %bb.0: -; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: jumbled_indices8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: jumbled_indices8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,5,4,3,2,7,6] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %exta = sext <16 x i16> %A to <16 x i32> %extb = sext <16 x i16> %B to <16 x i32> %m = mul <16 x i32> %exta, %extb @@ -2143,10 +2604,46 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { ; SSE2-LABEL: jumbled_indices16: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm4, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pmulhw %xmm4, %xmm8 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pmulhw %xmm5, %xmm8 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps 
{{.*#+}} xmm8 = xmm8[3,0],xmm7[3,1] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm6[0,3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm5[3,0] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm7[2,0] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[1,2] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[0,3] +; SSE2-NEXT: paddd %xmm11, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: jumbled_indices16: @@ -2194,16 +2691,99 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { ; SSE2-LABEL: jumbled_indices32: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm15 +; SSE2-NEXT: pmulhw %xmm7, %xmm15 +; SSE2-NEXT: pmullw %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: pmulhw %xmm9, %xmm15 +; SSE2-NEXT: pmullw %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: pmulhw %xmm10, %xmm15 +; SSE2-NEXT: pmullw %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm3, %xmm15 +; SSE2-NEXT: pmulhw %xmm11, %xmm15 +; SSE2-NEXT: pmullw %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm4, %xmm15 +; SSE2-NEXT: pmulhw %xmm12, %xmm15 +; SSE2-NEXT: pmullw %xmm12, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm5, 
%xmm15 +; SSE2-NEXT: pmulhw %xmm14, %xmm15 +; SSE2-NEXT: pmullw %xmm14, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: pmulhw %xmm13, %xmm15 +; SSE2-NEXT: pmullw %xmm13, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pmulhw %xmm15, %xmm7 +; SSE2-NEXT: pmullw %xmm15, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm15[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,1],xmm15[0,2] +; SSE2-NEXT: paddd %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm13[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm13[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm14[0,2] +; SSE2-NEXT: paddd %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm12[0,3] +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm11[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm9[2,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm9[3,0] +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm8, 112(%rdi) ; SSE2-NEXT: movdqa %xmm6, 96(%rdi) ; SSE2-NEXT: movdqa %xmm5, 80(%rdi) ; SSE2-NEXT: movdqa %xmm4, 64(%rdi) @@ -2306,10 +2886,44 @@ define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_256: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: packssdw %xmm4, %xmm5 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm3, 
%xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm2, %xmm4 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pmulhw %xmm4, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_256: @@ -2321,11 +2935,52 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddwd_256: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa (%rdi), %ymm0 -; AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddwd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmulld %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddwd_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm1 +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm3 +; AVX512-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwd %xmm3, %ymm3 +; AVX512-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %A = load <16 x i16>, <16 x i16>* %Aptr %B = load <16 x i16>, <16 x i16>* %Bptr %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> @@ -2345,14 +3000,82 @@ define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_512: ; 
SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 -; SSE2-NEXT: pmaddwd 32(%rsi), %xmm2 -; SSE2-NEXT: pmaddwd 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm9 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 +; SSE2-NEXT: movdqa 48(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm7 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rsi), %xmm6 +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pslld $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pslld $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm8, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: pslld $16, %xmm10 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pslld $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: packssdw %xmm10, %xmm8 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm9, %xmm1 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm9 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: packssdw %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: pslld $16, %xmm10 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm10, %xmm9 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm7, %xmm0 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm6 +; SSE2-NEXT: pmulhw %xmm9, %xmm6 +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pmulhw %xmm5, %xmm6 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pmulhw %xmm0, %xmm6 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_512: @@ -2377,20 +3100,24 @@ ; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: 
retq ; -; AVX512F-LABEL: pmaddwd_512: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: pmaddwd_512: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: pmaddwd_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm1 +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm3 +; AVX512-NEXT: vpsrld $16, %zmm2, %zmm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512-NEXT: vpmulld %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %A = load <32 x i16>, <32 x i16>* %Aptr %B = load <32 x i16>, <32 x i16>* %Bptr %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> @@ -2411,30 +3138,175 @@ ; SSE2-LABEL: pmaddwd_1024: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pmaddwd (%rdx), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rdx), %xmm1 -; SSE2-NEXT: pmaddwd 32(%rdx), %xmm2 -; SSE2-NEXT: pmaddwd 48(%rdx), %xmm3 -; SSE2-NEXT: movdqa 64(%rsi), %xmm4 -; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 -; SSE2-NEXT: movdqa 80(%rsi), %xmm5 -; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 -; SSE2-NEXT: movdqa 96(%rsi), %xmm6 -; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 -; SSE2-NEXT: movdqa 112(%rsi), %xmm7 -; SSE2-NEXT: pmaddwd 112(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, 112(%rdi) -; SSE2-NEXT: movdqa %xmm6, 96(%rdi) -; SSE2-NEXT: movdqa %xmm5, 80(%rdi) -; SSE2-NEXT: movdqa %xmm4, 64(%rdi) -; SSE2-NEXT: movdqa %xmm3, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, 32(%rdi) -; SSE2-NEXT: movdqa %xmm1, 16(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa 96(%rsi), %xmm13 +; SSE2-NEXT: movdqa 112(%rsi), %xmm0 +; SSE2-NEXT: movdqa 64(%rsi), %xmm12 +; SSE2-NEXT: movdqa 80(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm4 +; SSE2-NEXT: movdqa 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm10 +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movaps 80(%rdx), %xmm2 +; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa (%rdx), %xmm6 +; SSE2-NEXT: movdqa 16(%rdx), %xmm14 +; SSE2-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm2, %xmm9 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm10, %xmm8 +; SSE2-NEXT: pslld $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: packssdw %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm12, %xmm11 +; SSE2-NEXT: pslld $16, %xmm11 +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: packssdw %xmm2, %xmm11 +; SSE2-NEXT: movdqa %xmm0, 
%xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: pslld $16, %xmm15 +; SSE2-NEXT: psrad $16, %xmm15 +; SSE2-NEXT: packssdw %xmm2, %xmm15 +; SSE2-NEXT: movdqa 32(%rdx), %xmm5 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa 48(%rdx), %xmm4 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: packssdw %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: packssdw %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm13 +; SSE2-NEXT: packssdw %xmm0, %xmm13 +; SSE2-NEXT: pslld $16, %xmm14 +; SSE2-NEXT: psrad $16, %xmm14 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pslld $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: packssdw %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: pslld $16, %xmm12 +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: packssdw %xmm0, %xmm12 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: movdqa 64(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pslld $16, %xmm10 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: packssdw %xmm0, %xmm10 +; SSE2-NEXT: movdqa 112(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: movdqa 96(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm14 +; SSE2-NEXT: pslld $16, %xmm14 +; SSE2-NEXT: psrad $16, %xmm14 +; SSE2-NEXT: packssdw %xmm2, %xmm14 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: packssdw %xmm2, %xmm6 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: packssdw %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm15, %xmm3 +; SSE2-NEXT: pmulhw %xmm14, %xmm3 +; SSE2-NEXT: pmullw %xmm15, %xmm14 +; SSE2-NEXT: movdqa %xmm14, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm11, %xmm3 +; SSE2-NEXT: pmulhw %xmm10, %xmm3 +; SSE2-NEXT: pmullw %xmm11, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pmulhw %xmm12, %xmm3 +; SSE2-NEXT: pmullw %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm9, 
%xmm4 +; SSE2-NEXT: pmulhw %xmm7, %xmm4 +; SSE2-NEXT: pmullw %xmm9, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm13, %xmm9 +; SSE2-NEXT: pmulhw %xmm0, %xmm9 +; SSE2-NEXT: pmullw %xmm13, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE2-NEXT: paddd %xmm14, %xmm0 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: pmulhw %xmm1, %xmm9 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE2-NEXT: paddd %xmm11, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm10, %xmm9 +; SSE2-NEXT: pmulhw %xmm5, %xmm9 +; SSE2-NEXT: pmullw %xmm10, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: paddd %xmm8, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm9, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE2-NEXT: paddd %xmm3, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm4, 112(%rdi) +; SSE2-NEXT: movdqa %xmm0, 96(%rdi) +; SSE2-NEXT: movdqa %xmm2, 80(%rdi) +; SSE2-NEXT: movdqa %xmm1, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 48(%rdi) +; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm9, 16(%rdi) +; SSE2-NEXT: movdqa %xmm6, (%rdi) ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_1024: @@ -2514,13 +3386,26 @@ ; SSE2-LABEL: pmaddwd_commuted_mul: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_commuted_mul: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rsi), %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %A = load <8 x i16>, <8 x i16>* 
%Aptr %B = load <8 x i16>, <8 x i16>* %Bptr @@ -2541,14 +3426,20 @@ define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_swapped_indices: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_swapped_indices: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = mem[1,0,2,3,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %A = load <8 x i16>, <8 x i16>* %Aptr %B = load <8 x i16>, <8 x i16>* %Bptr @@ -2604,31 +3495,80 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rsi), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: movdqu (%rdx), %xmm0 -; SSE2-NEXT: movdqu (%rcx), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqu (%rdx), %xmm1 +; SSE2-NEXT: movdqu (%rcx), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: madd_double_reduction: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: madd_double_reduction: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxwd (%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; 
AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdx), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdx), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX256-LABEL: madd_double_reduction: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX256-NEXT: vpmovsxwd (%rsi), %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%rdx), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vmovd %xmm0, %eax +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %tmp = load <8 x i16>, <8 x i16>* %arg, align 1 %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1 %tmp7 = sext <8 x i16> %tmp to <8 x i32> @@ -2655,49 +3595,140 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu (%rsi), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 +; SSE2-NEXT: movdqu (%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: movdqu (%rdx), %xmm0 ; SSE2-NEXT: movdqu (%rcx), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqu (%r8), %xmm0 -; SSE2-NEXT: movdqu (%r9), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqu (%r8), %xmm1 +; SSE2-NEXT: movdqu (%r9), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movdqu (%r10), %xmm0 ; SSE2-NEXT: movdqu (%rax), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: madd_quad_reduction: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqu (%r8), %xmm1 -; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%r10), %xmm1 -; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: madd_quad_reduction: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxwd (%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdx), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdx), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovsxwd (%r8), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r8), %xmm2 +; AVX1-NEXT: vpmovsxwd (%r9), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r9), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%r10), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r10), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rax), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rax), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX256-LABEL: madd_quad_reduction: +; AVX256: # %bb.0: +; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX256-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX256-NEXT: vpmovsxwd (%rsi), %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%rdx), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpmovsxwd (%r8), %ymm1 +; AVX256-NEXT: vpmovsxwd (%r9), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%r10), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rax), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vmovd %xmm0, %eax +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %tmp = load <8 x i16>, <8 x i16>* %arg, align 1 %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1 %tmp7 = sext <8 x i16> %tmp to <8 x i32> @@ -2753,8 +3784,15 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm4 +; SSE2-NEXT: pmullw %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm5[0,2] +; SSE2-NEXT: psrld $16, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm6, %xmm1 ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: addq $-8, %rax ; SSE2-NEXT: jne .LBB33_1 @@ -2835,16 +3873,16 @@ ; AVX256-NEXT: jne .LBB33_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vmovd %xmm1, %ecx ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -2908,7 +3946,16 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: psubw %xmm2, %xmm3 -; SSE2-NEXT: pmaddwd %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pmulhw %xmm3, %xmm2 +; SSE2-NEXT: pmullw %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -2930,10 +3977,15 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB34_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmulld %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rcx @@ -2958,19 +4010,21 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB34_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX256-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $8, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; 
AVX256-NEXT: jne .LBB34_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -3114,14 +4168,30 @@ ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 +; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: addq $16, %rax ; SSE2-NEXT: cmpq %r8, %rax ; SSE2-NEXT: jb .LBB38_1 @@ -3146,11 +4216,19 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB38_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rax), %xmm2 -; AVX1-NEXT: vpmovsxbw (%rdi,%rax), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rax), %xmm2 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rax), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rax), %xmm3 +; AVX1-NEXT: vpmovsxbd (%rdi,%rax), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rax), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd (%rsi,%rax), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 @@ -3183,8 +4261,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB38_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rax), %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rax), %ymm2 +; AVX2-NEXT: vpmovsxbd (%rdi,%rax), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxbd (%rsi,%rax), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, 
%ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rax @@ -3193,9 +4277,9 @@ ; AVX2-NEXT: # %bb.2: # %afterloop ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -3211,9 +4295,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB38_1: # %loop ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rax), %ymm1 -; AVX512-NEXT: vpmovsxbw (%rsi,%rax), %ymm2 -; AVX512-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxbd (%rdi,%rax), %zmm1 +; AVX512-NEXT: vpmovsxbd (%rsi,%rax), %zmm2 +; AVX512-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rax ; AVX512-NEXT: cmpq %r8, %rax @@ -3222,9 +4309,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/mask-negated-bool.ll b/llvm/test/CodeGen/X86/mask-negated-bool.ll --- a/llvm/test/CodeGen/X86/mask-negated-bool.ll +++ b/llvm/test/CodeGen/X86/mask-negated-bool.ll @@ -27,7 +27,10 @@ define <4 x i32> @mask_negated_zext_bool_vec(<4 x i1> %x) { ; CHECK-LABEL: mask_negated_zext_bool_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %ext = zext <4 x i1> %x to <4 x i32> %neg = sub <4 x i32> zeroinitializer, %ext @@ -61,7 +64,10 @@ define <4 x i32> @mask_negated_sext_bool_vec(<4 x i1> %x) { ; CHECK-LABEL: mask_negated_sext_bool_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %neg = sub <4 x i32> zeroinitializer, %ext diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1290,7 +1290,7 @@ ; SSE2-NEXT: pmovmskb %xmm8, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne LBB6_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -1614,7 +1614,7 @@ ; SSE42-NEXT: pmovmskb %xmm8, %eax ; SSE42-NEXT: shll $16, %eax ; SSE42-NEXT: orl %ecx, 
%eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: jne LBB6_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -1895,7 +1895,7 @@ ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB6_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -3400,8 +3400,9 @@ ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne LBB11_1 ; AVX512F-NEXT: ## %bb.2: ## %else ; AVX512F-NEXT: testb $2, %al @@ -3543,8 +3544,9 @@ ; AVX512VLBW-LABEL: compressstore_v8i16_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm1, %xmm1, %k0 +; AVX512VLBW-NEXT: kmovd %k0, %ecx ; AVX512VLBW-NEXT: kmovd %k0, %eax -; AVX512VLBW-NEXT: testb $1, %al +; AVX512VLBW-NEXT: testb $1, %cl ; AVX512VLBW-NEXT: jne LBB11_1 ; AVX512VLBW-NEXT: ## %bb.2: ## %else ; AVX512VLBW-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1388,7 +1388,7 @@ ; SSE2-NEXT: pmovmskb %xmm8, %ecx ; SSE2-NEXT: shll $16, %ecx ; SSE2-NEXT: orl %edx, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $1, %dl ; SSE2-NEXT: jne LBB8_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl @@ -1745,7 +1745,7 @@ ; SSE42-NEXT: pmovmskb %xmm8, %ecx ; SSE42-NEXT: shll $16, %ecx ; SSE42-NEXT: orl %edx, %ecx -; SSE42-NEXT: testb $1, %cl +; SSE42-NEXT: testb $1, %dl ; SSE42-NEXT: jne LBB8_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %cl @@ -2042,7 +2042,7 @@ ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB8_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -2665,20 +2665,16 @@ ; define <2 x i64> @expandload_v2i64_const(ptr %base, <2 x i64> %src0) { -; SSE2-LABEL: expandload_v2i64_const: -; SSE2: ## %bb.0: -; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE42-LABEL: expandload_v2i64_const: -; SSE42: ## %bb.0: -; SSE42-NEXT: pinsrq $1, (%rdi), %xmm0 -; SSE42-NEXT: retq +; SSE-LABEL: expandload_v2i64_const: +; SSE: ## %bb.0: +; SSE-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: expandload_v2i64_const: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovddup (%rdi), %xmm1 ## xmm1 = mem[0,0] +; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: expandload_v2i64_const: @@ -3005,8 +3001,9 @@ ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne LBB11_1 ; AVX512F-NEXT: ## %bb.2: ## %else ; AVX512F-NEXT: testb $2, %al @@ -3148,8 +3145,9 @@ ; AVX512VLBW-LABEL: expandload_v8i16_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm1, %xmm1, %k0 +; AVX512VLBW-NEXT: kmovd %k0, %ecx ; 
AVX512VLBW-NEXT: kmovd %k0, %eax -; AVX512VLBW-NEXT: testb $1, %al +; AVX512VLBW-NEXT: testb $1, %cl ; AVX512VLBW-NEXT: jne LBB11_1 ; AVX512VLBW-NEXT: ## %bb.2: ## %else ; AVX512VLBW-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -139,8 +139,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB0_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx @@ -338,8 +339,9 @@ ; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB1_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx @@ -533,8 +535,9 @@ ; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB2_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -867,8 +867,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $xmm0 ; KNL_64-NEXT: je .LBB14_2 ; KNL_64-NEXT: # %bb.1: # %cond.load @@ -908,8 +909,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: # implicit-def: $xmm0 ; KNL_32-NEXT: jne .LBB14_1 ; KNL_32-NEXT: # %bb.2: # %else @@ -981,8 +983,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB15_2 ; KNL_64-NEXT: # %bb.1: # %cond.load ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -1025,8 +1028,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB15_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1098,8 +1102,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB16_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1127,8 +1132,9 @@ ; 
KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB16_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1220,8 +1226,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB17_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm1, %rcx @@ -1257,8 +1264,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB17_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1320,8 +1328,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2 ; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB18_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm1, %rcx @@ -1363,8 +1372,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: je .LBB18_2 ; KNL_32-NEXT: # %bb.1: # %cond.store ; KNL_32-NEXT: vmovd %xmm1, %ecx @@ -1423,8 +1433,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB19_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1447,8 +1458,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB19_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1520,8 +1532,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB20_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1544,8 +1557,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB20_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1624,8 +1638,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB21_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1654,8 +1669,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), 
%xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB21_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1747,8 +1763,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB22_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1778,8 +1795,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB22_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1874,8 +1892,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB23_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1903,8 +1922,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB23_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1993,8 +2013,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB24_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -2023,8 +2044,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB24_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -2165,8 +2187,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB26_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -2194,8 +2217,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB26_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -3251,7 +3275,8 @@ ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm1 ; KNL_64-NEXT: je .LBB42_2 ; KNL_64-NEXT: # %bb.1: # %cond.load @@ -3281,7 +3306,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; 
KNL_64-NEXT: .LBB42_8: # %else8 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm3 ; KNL_64-NEXT: jne .LBB42_9 ; KNL_64-NEXT: # %bb.10: # %else15 @@ -3299,7 +3325,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; KNL_64-NEXT: .LBB42_16: # %else33 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm4 ; KNL_64-NEXT: jne .LBB42_17 ; KNL_64-NEXT: # %bb.18: # %else40 @@ -3364,16 +3391,19 @@ ; KNL_32-NEXT: movl %esp, %ebp ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: pushl %ebx +; KNL_32-NEXT: pushl %edi ; KNL_32-NEXT: pushl %esi ; KNL_32-NEXT: andl $-32, %esp ; KNL_32-NEXT: subl $32, %esp -; KNL_32-NEXT: .cfi_offset %esi, -16 +; KNL_32-NEXT: .cfi_offset %esi, -20 +; KNL_32-NEXT: .cfi_offset %edi, -16 ; KNL_32-NEXT: .cfi_offset %ebx, -12 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl -; KNL_32-NEXT: vmovd %xmm0, %eax +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: vmovd %xmm0, %edi ; KNL_32-NEXT: # implicit-def: $ymm1 ; KNL_32-NEXT: je .LBB42_2 ; KNL_32-NEXT: # %bb.1: # %cond.load @@ -3406,7 +3436,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; KNL_32-NEXT: .LBB42_8: # %else8 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: # implicit-def: $ymm0 ; KNL_32-NEXT: jne .LBB42_9 ; KNL_32-NEXT: # %bb.10: # %else15 @@ -3425,7 +3456,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; KNL_32-NEXT: .LBB42_16: # %else33 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: # implicit-def: $ymm2 ; KNL_32-NEXT: jne .LBB42_17 ; KNL_32-NEXT: # %bb.18: # %else40 @@ -3445,8 +3477,9 @@ ; KNL_32-NEXT: .LBB42_24: # %else58 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; KNL_32-NEXT: leal -8(%ebp), %esp +; KNL_32-NEXT: leal -12(%ebp), %esp ; KNL_32-NEXT: popl %esi +; KNL_32-NEXT: popl %edi ; KNL_32-NEXT: popl %ebx ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: .cfi_def_cfa %esp, 4 @@ -3705,8 +3738,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 ; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB47_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -3737,8 +3771,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB47_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4016,8 +4051,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 ; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB52_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4043,8 +4079,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 ; 
KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB52_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4396,9 +4433,10 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB58_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4423,9 +4461,10 @@ ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB58_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4592,8 +4631,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB60_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4618,8 +4658,9 @@ ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB60_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4780,8 +4821,9 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB62_2 ; KNL_64-NEXT: # %bb.1: # %cond.load ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -4820,8 +4862,9 @@ ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB62_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4890,8 +4933,9 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB63_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -4928,8 +4972,9 @@ ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB63_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ 
b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -41,8 +41,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB0_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -110,8 +111,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB1_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -189,8 +191,9 @@ ; WIDEN_KNL: # %bb.0: ; WIDEN_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB2_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -249,8 +252,9 @@ ; WIDEN_KNL: # %bb.0: ; WIDEN_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB3_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -331,8 +335,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB4_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -401,8 +406,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB5_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -2941,8 +2941,9 @@ ; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne LBB21_1 ; AVX512F-NEXT: ## %bb.2: ## %else ; AVX512F-NEXT: testb $2, %al @@ -4517,7 +4518,7 @@ ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne LBB24_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -4936,7 +4937,7 @@ ; SSE42-NEXT: pmovmskb %xmm1, %eax ; SSE42-NEXT: shll $16, %eax ; SSE42-NEXT: orl %ecx, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: jne LBB24_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -5170,7 +5171,7 @@ ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB24_1 ; AVX1-NEXT: ## 
%bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -6559,20 +6560,13 @@ } define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) { -; SSE2-LABEL: mload_constmask_v8f32_zero: -; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: mload_constmask_v8f32_zero: -; SSE42: ## %bb.0: -; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero -; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: mload_constmask_v8f32_zero: +; SSE: ## %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8f32_zero: ; AVX1OR2: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -1764,8 +1764,9 @@ ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne LBB13_1 ; AVX512F-NEXT: ## %bb.2: ## %else ; AVX512F-NEXT: testb $2, %al @@ -3283,7 +3284,7 @@ ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: jne LBB16_1 ; SSE2-NEXT: ## %bb.2: ## %else @@ -3491,7 +3492,7 @@ ; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne LBB16_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %al @@ -3726,7 +3727,7 @@ ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB16_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -5274,10 +5275,10 @@ ; SSE2-NEXT: andb $1, %dl ; SSE2-NEXT: addb %dl, %dl ; SSE2-NEXT: orb %sil, %dl -; SSE2-NEXT: andb $1, %cl ; SSE2-NEXT: shlb $2, %cl ; SSE2-NEXT: orb %dl, %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: andb $7, %cl +; SSE2-NEXT: testb %sil, %sil ; SSE2-NEXT: jne LBB28_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl @@ -5307,10 +5308,10 @@ ; SSE4-NEXT: andb $1, %dl ; SSE4-NEXT: addb %dl, %dl ; SSE4-NEXT: orb %sil, %dl -; SSE4-NEXT: andb $1, %cl ; SSE4-NEXT: shlb $2, %cl ; SSE4-NEXT: orb %dl, %cl -; SSE4-NEXT: testb $1, %cl +; SSE4-NEXT: andb $7, %cl +; SSE4-NEXT: testb %sil, %sil ; SSE4-NEXT: jne LBB28_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %cl @@ -5656,6 +5657,8 @@ ; SSE2-NEXT: movdqa 16(%rsi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm5 ; SSE2-NEXT: packssdw 48(%rdi), %xmm7 +; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 ; SSE2-NEXT: packssdw 16(%rdi), %xmm6 ; SSE2-NEXT: packsswb %xmm7, %xmm6 ; SSE2-NEXT: packssdw 80(%rdi), %xmm8 @@ -5666,7 +5669,8 @@ ; SSE2-NEXT: andl $85, %ecx ; SSE2-NEXT: shll $16, %ecx ; SSE2-NEXT: orl %edi, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: movd %xmm9, %edi +; SSE2-NEXT: testb $1, %dil ; SSE2-NEXT: jne LBB31_1 ; SSE2-NEXT: ## %bb.2: ## 
%else ; SSE2-NEXT: testb $2, %cl @@ -5891,8 +5895,8 @@ ; SSE4-NEXT: .cfi_offset %r14, -32 ; SSE4-NEXT: .cfi_offset %r15, -24 ; SSE4-NEXT: .cfi_offset %rbp, -16 -; SSE4-NEXT: movdqa (%rdi), %xmm1 -; SSE4-NEXT: movdqa 32(%rdi), %xmm2 +; SSE4-NEXT: movdqa (%rdi), %xmm2 +; SSE4-NEXT: movdqa 32(%rdi), %xmm1 ; SSE4-NEXT: movdqa 64(%rdi), %xmm0 ; SSE4-NEXT: movl 92(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill @@ -5916,19 +5920,22 @@ ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 52(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: packssdw 48(%rdi), %xmm2 -; SSE4-NEXT: packssdw 16(%rdi), %xmm1 -; SSE4-NEXT: packsswb %xmm2, %xmm1 +; SSE4-NEXT: packssdw 48(%rdi), %xmm1 +; SSE4-NEXT: pxor %xmm3, %xmm3 +; SSE4-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE4-NEXT: packssdw 16(%rdi), %xmm2 +; SSE4-NEXT: packsswb %xmm1, %xmm2 ; SSE4-NEXT: packssdw 80(%rdi), %xmm0 ; SSE4-NEXT: packsswb %xmm0, %xmm0 -; SSE4-NEXT: pmovmskb %xmm1, %eax +; SSE4-NEXT: pmovmskb %xmm2, %eax ; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555 ; SSE4-NEXT: pmovmskb %xmm0, %edi ; SSE4-NEXT: andl $85, %edi ; SSE4-NEXT: shll $16, %edi ; SSE4-NEXT: orl %eax, %edi +; SSE4-NEXT: movd %xmm3, %eax +; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: movl 48(%rsi), %r13d -; SSE4-NEXT: testb $1, %dil ; SSE4-NEXT: movl 44(%rsi), %eax ; SSE4-NEXT: movl 40(%rsi), %ecx ; SSE4-NEXT: movl 36(%rsi), %r8d @@ -6135,21 +6142,17 @@ ; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm5 ; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm5 -; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm8 +; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm5 +; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm6 +; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm7 +; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm8 ; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3],xmm8[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm8, %xmm8 ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm7, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-NEXT: vmaskmovps %ymm0, %ymm7, (%rdx) ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] -; AVX1-NEXT: vpslld $31, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, 64(%rdx) ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero @@ -6175,19 +6178,20 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-NEXT: vpcmpgtd %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vpacksswb %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 -; AVX2-NEXT: vpmaskmovd %ymm0, %ymm3, (%rdx) -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpslld $31, %ymm4, %ymm4 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm4, 32(%rdx) +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx) +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx) +; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6411,8 +6415,8 @@ ; ; AVX2-LABEL: undefshuffle: ; AVX2: ## %bb.0: -; AVX2-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,2,u,u,u,4,u,u,u,6,u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u],zero,ymm0[u,u,u] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -514,8 +514,9 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB1_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -901,8 +902,9 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1290,8 +1292,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1521,8 +1524,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd 
%zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1760,8 +1764,9 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1879,8 +1884,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -3560,8 +3566,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB11_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -3937,8 +3944,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB12_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4141,8 +4149,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4315,8 +4324,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4385,7 +4395,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4602,7 +4612,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -4847,7 +4857,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6407,8 +6417,9 @@ ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB17_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -16,99 +16,103 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 
= xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm0, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm0, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: pxor %xmm8, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm9 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand 
%xmm12, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 @@ -120,8 +124,8 @@ ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: por %xmm10, %xmm3 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 @@ -130,8 +134,8 @@ ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pandn %xmm0, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm8 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 @@ -139,7 +143,7 @@ ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm9, %xmm6 +; SSE2-NEXT: por %xmm8, %xmm6 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -344,29 +348,30 @@ ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -387,108 +392,112 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm8 ; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm1, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; 
SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm12, %xmm0 +; SSE2-NEXT: packssdw %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm10 +; SSE2-NEXT: por %xmm2, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm8 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 @@ -840,8 +849,9 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB1_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -903,19 +913,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; 
AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $24, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -936,108 +946,112 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm8 ; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm1, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm12, %xmm0 +; SSE2-NEXT: packssdw %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm10 
+; SSE2-NEXT: por %xmm2, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm8 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 @@ -1386,8 +1400,9 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1449,19 +1464,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1482,25 +1497,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} 
xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -1509,32 +1525,33 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 @@ -1678,9 +1695,8 @@ ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1713,25 +1729,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -1740,33 +1757,34 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 +; SSE2-NEXT: packssdw %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 @@ -1959,8 +1977,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: 
vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2006,9 +2025,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2029,25 +2047,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -2056,37 +2075,38 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm7, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm0, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, 
%xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: packssdw %xmm1, %xmm4 -; SSE2-NEXT: packssdw %xmm4, %xmm4 -; SSE2-NEXT: packsswb %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %ecx +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: packssdw %xmm8, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm4, %eax +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -2279,8 +2299,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2326,9 +2347,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2348,34 +2368,36 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 @@ -2475,9 +2497,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v2i64_v2i32: @@ -2508,35 +2529,37 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 @@ -2653,8 +2676,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; 
AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2686,9 +2710,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, @@ -2707,25 +2730,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2733,9 +2758,9 @@ ; SSE2-NEXT: packssdw %xmm3, %xmm3 ; SSE2-NEXT: packssdw %xmm3, %xmm3 ; SSE2-NEXT: packsswb %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al @@ -2849,8 +2874,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2882,9 +2908,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; 
AVX512BWVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, @@ -3551,18 +3576,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4209,18 +4232,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4519,8 +4540,9 @@ ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB11_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4594,9 +4616,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4893,8 +4914,9 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB12_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4969,9 +4991,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; 
AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5102,8 +5123,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5148,9 +5170,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, @@ -5341,8 +5362,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5388,9 +5410,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, @@ -5415,7 +5436,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5627,7 +5648,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -5869,7 +5890,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6570,19 +6591,18 @@ ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %k1 +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; 
AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -7208,9 +7228,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -7428,8 +7447,9 @@ ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB17_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -7502,9 +7522,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp slt <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -11,48 +11,48 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm11 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pand %xmm10, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm10 -; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 ; SSE2-NEXT: 
movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm7, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm6, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -62,8 +62,8 @@ ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 @@ -72,8 +72,8 @@ ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm10, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 @@ -276,26 +276,30 @@ ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq 
; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -313,56 +317,56 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm9 ; SSE2-NEXT: pxor %xmm7, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm8, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm10 -; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm11 ; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm10 -; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm1 +; SSE2-NEXT: pxor %xmm9, %xmm11 +; SSE2-NEXT: por %xmm1, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm8, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm9, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm8, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm9, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pxor %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm2, %xmm7 +; SSE2-NEXT: por %xmm9, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] +; SSE2-NEXT: pslld $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm7, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 @@ -703,8 +707,9 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB1_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -766,17 +771,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $24, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -795,51 +802,51 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm1 ; SSE2-NEXT: pandn %xmm7, %xmm10 ; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: packuswb %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm7, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 +; SSE2-NEXT: packuswb %xmm0, %xmm8 +; SSE2-NEXT: packuswb %xmm8, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 @@ -1177,8 +1184,9 @@ ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1240,17 +1248,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1268,27 +1278,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; 
SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm7, %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm7 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax @@ -1432,8 +1442,8 @@ ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1463,27 +1473,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm5 +; SSE2-NEXT: pxor %xmm7, %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm7 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 @@ -1679,8 +1689,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1726,8 
+1737,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -1746,34 +1757,34 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm6, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm0, %xmm5 +; SSE2-NEXT: packuswb %xmm6, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 +; SSE2-NEXT: packuswb %xmm5, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm5, %eax ; SSE2-NEXT: jne .LBB5_1 ; SSE2-NEXT: # %bb.2: # %else ; SSE2-NEXT: testb $2, %cl @@ -1963,8 +1974,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2010,8 +2022,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2028,11 +2040,13 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; 
SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm0, %xmm3 @@ -2129,8 +2143,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v2i64_v2i32: @@ -2158,11 +2172,13 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm0, %xmm3 @@ -2251,8 +2267,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2284,8 +2301,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, @@ -2301,11 +2318,13 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 @@ -2391,8 
+2410,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2424,8 +2444,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, @@ -3123,16 +3143,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -3819,16 +3839,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4152,8 +4172,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB11_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4227,8 +4248,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4545,8 +4566,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; 
AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB12_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4620,8 +4642,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4800,8 +4822,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4847,8 +4870,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, @@ -5028,8 +5051,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5075,8 +5099,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, @@ -5106,7 +5130,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5327,7 +5351,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -5574,7 +5598,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6281,17 +6305,18 @@ ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %k1 +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: 
truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -6930,8 +6955,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -7153,8 +7178,9 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB17_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -7228,8 +7254,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp ult <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -180,24 +180,22 @@ ; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero ; SSE-NEXT: addss %xmm13, %xmm1 ; SSE-NEXT: addss %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: mulss %xmm11, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: mulss %xmm11, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,0,0,0] ; SSE-NEXT: mulps %xmm0, %xmm11 -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: mulss %xmm10, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: mulss %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0,0,0] ; SSE-NEXT: mulps %xmm3, %xmm10 ; SSE-NEXT: addps %xmm11, %xmm10 ; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: mulss %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0,0,0] ; SSE-NEXT: mulps %xmm6, %xmm8 ; SSE-NEXT: addps %xmm10, %xmm8 -; SSE-NEXT: addss %xmm7, %xmm12 -; SSE-NEXT: addss %xmm11, %xmm12 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: addss %xmm12, %xmm7 +; SSE-NEXT: addss %xmm11, %xmm7 ; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero ; SSE-NEXT: mulss %xmm10, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] @@ -212,11 +210,12 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: mulps %xmm6, %xmm3 ; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,1] ; SSE-NEXT: addss %xmm2, %xmm5 ; SSE-NEXT: addss %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[0,1],xmm8[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,1] ; SSE-NEXT: movss %xmm5, 32(%rdi) ; SSE-NEXT: movaps %xmm7, 16(%rdi) ; SSE-NEXT: movaps %xmm4, (%rdi) @@ -256,7 +255,6 @@ ; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vmulss %xmm10, %xmm8, %xmm9 ; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 ; AVX1-NEXT: vmulps %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 @@ -270,15 +268,13 @@ ; AVX1-NEXT: vaddss %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm7[1,1,3,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-NEXT: vmovss %xmm2, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovaps %xmm1, (%rdi) +; AVX1-NEXT: vmovaps %xmm0, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul3x3_f32: @@ -315,36 +311,35 @@ ; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vmulss %xmm10, %xmm8, %xmm9 ; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] -; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 -; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 -; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm3 +; AVX2-NEXT: vmulps %xmm0, %xmm9, %xmm0 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 +; AVX2-NEXT: vmulps %xmm3, %xmm10, %xmm3 ; AVX2-NEXT: vaddps %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 ; AVX2-NEXT: vmulps %xmm3, %xmm6, %xmm6 ; AVX2-NEXT: vaddps %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vmulss %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm5 +; AVX2-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX2-NEXT: vmulss %xmm5, %xmm10, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,1,2,4,5,6,u,u> -; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm7[1,1,3,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] ; AVX2-NEXT: vmovss %xmm2, 32(%rdi) -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovaps %xmm1, (%rdi) +; AVX2-NEXT: vmovaps %xmm0, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_mul3x3_f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] -; AVX512F-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX512F-NEXT: vmulps %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm0, %xmm3 ; 
AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[2,3] ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] ; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 ; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 @@ -356,105 +351,108 @@ ; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 ; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] -; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 -; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6 -; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 -; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8 -; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3] +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm5[1,1,3,3] +; AVX512F-NEXT: vmulss %xmm6, %xmm11, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm10, %xmm5 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vmulss %xmm6, %xmm8, %xmm8 +; AVX512F-NEXT: vaddss %xmm5, %xmm8, %xmm5 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm5[0],xmm9[3] ; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8 ; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2] -; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11 -; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3] -; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12 +; AVX512F-NEXT: vmovsldup {{.*#+}} xmm10 = xmm9[0,0,2,2] +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm10 +; AVX512F-NEXT: vaddps %xmm10, %xmm8, %xmm8 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm9[1,1,3,3] +; AVX512F-NEXT: vmulps %xmm3, %xmm10, %xmm12 ; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8 ; AVX512F-NEXT: vmulss %xmm7, %xmm4, %xmm7 -; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12 +; AVX512F-NEXT: vmulss %xmm9, %xmm11, %xmm12 ; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7 -; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 -; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] -; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] -; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512F-NEXT: vmulss %xmm6, %xmm10, %xmm10 +; AVX512F-NEXT: vaddss %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm12 = xmm9[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 -; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 ; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2 -; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3 +; AVX512F-NEXT: vmulss %xmm4, %xmm12, %xmm2 +; AVX512F-NEXT: vmulss %xmm10, %xmm11, %xmm3 ; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm8[1,1,3,3] +; AVX512F-NEXT: vinsertps 
{{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],xmm8[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mul3x3_f32: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX512VL-NEXT: vmulps %xmm3, %xmm0, %xmm3 -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512VL-NEXT: vmulps %xmm2, %xmm0, %xmm2 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 -; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] +; AVX512VL-NEXT: vmulps %xmm5, %xmm4, %xmm6 +; AVX512VL-NEXT: vaddps %xmm6, %xmm2, %xmm2 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm3[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vaddps %xmm2, %xmm9, %xmm2 ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0] ; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3] -; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vmulss %xmm5, %xmm3, %xmm5 ; AVX512VL-NEXT: vaddss %xmm5, %xmm10, %xmm5 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm10 ; AVX512VL-NEXT: vmulss %xmm8, %xmm10, %xmm8 ; AVX512VL-NEXT: vaddss %xmm5, %xmm8, %xmm5 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3] ; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm5 ; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm8 ; AVX512VL-NEXT: vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2] -; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm11 +; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm11 ; AVX512VL-NEXT: vaddps %xmm5, %xmm11, %xmm5 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3] ; AVX512VL-NEXT: vmulps %xmm6, %xmm11, %xmm12 ; AVX512VL-NEXT: vaddps %xmm5, %xmm12, %xmm5 ; AVX512VL-NEXT: vmulss %xmm7, %xmm9, %xmm7 -; AVX512VL-NEXT: vmulss %xmm4, %xmm8, %xmm12 +; AVX512VL-NEXT: vmulss %xmm3, %xmm8, %xmm12 ; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] -; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm12 = xmm8[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 -; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm4 +; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1 -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 -; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vaddps %xmm2, 
%xmm0, %xmm0 -; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2 -; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4 -; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm4 +; AVX512VL-NEXT: vmulps %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulss %xmm12, %xmm9, %xmm4 +; AVX512VL-NEXT: vmulss %xmm3, %xmm11, %xmm3 +; AVX512VL-NEXT: vaddss %xmm3, %xmm4, %xmm3 ; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1 -; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 -; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512VL-NEXT: vaddss %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm5[1,1,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[2,3] +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq entry: %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> @@ -617,7 +615,6 @@ ; AVX1-NEXT: vaddsd %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vmulsd %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX1-NEXT: vmulpd %xmm7, %xmm1, %xmm9 ; AVX1-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] @@ -644,15 +641,13 @@ ; AVX1-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm4 = xmm9[1],xmm7[0] ; AVX1-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX1-NEXT: vmovapd %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovapd %xmm1, 48(%rdi) +; AVX1-NEXT: vmovapd %xmm0, (%rdi) +; AVX1-NEXT: vmovapd %xmm4, 32(%rdi) +; AVX1-NEXT: vmovapd %xmm3, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul3x3_f64: @@ -675,7 +670,6 @@ ; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7 ; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] @@ -702,70 +696,68 @@ ; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm9[1],xmm7[0] ; AVX2-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX2-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX2-NEXT: vmovapd %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovapd %xmm1, 48(%rdi) +; AVX2-NEXT: vmovapd %xmm0, (%rdi) +; 
AVX2-NEXT: vmovapd %xmm4, 32(%rdi) +; AVX2-NEXT: vmovapd %xmm3, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_mul3x3_f64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm0, %xmm9, %xmm10 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] -; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm3, %xmm1, %xmm4 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm4, %xmm3, %xmm10 +; AVX512F-NEXT: vaddpd %xmm0, %xmm10, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX512F-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vaddpd %xmm0, %xmm10, %xmm0 ; AVX512F-NEXT: vmulsd %xmm2, %xmm9, %xmm9 -; AVX512F-NEXT: vmulsd %xmm3, %xmm5, %xmm3 -; AVX512F-NEXT: vaddsd %xmm3, %xmm9, %xmm3 -; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vaddsd %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm10 -; AVX512F-NEXT: vaddpd %xmm7, %xmm10, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm6, %xmm10, %xmm11 -; AVX512F-NEXT: vaddpd %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm4, %xmm5, %xmm4 ; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 +; AVX512F-NEXT: vaddsd %xmm7, %xmm4, %xmm4 ; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm1 -; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm6, %xmm6 -; AVX512F-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512F-NEXT: vmulpd %xmm7, %xmm1, %xmm9 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm10, %xmm11 +; AVX512F-NEXT: vaddpd %xmm11, %xmm9, %xmm9 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm6, %xmm11, %xmm12 +; AVX512F-NEXT: vaddpd %xmm12, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm7 +; AVX512F-NEXT: vmulsd %xmm5, %xmm10, %xmm10 +; AVX512F-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmulsd %xmm11, %xmm8, %xmm10 +; AVX512F-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm11, %xmm3 +; AVX512F-NEXT: vaddpd %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm6, %xmm6 +; 
AVX512F-NEXT: vaddpd %xmm6, %xmm1, %xmm1 +; AVX512F-NEXT: vmulsd %xmm2, %xmm10, %xmm2 +; AVX512F-NEXT: vmulsd %xmm5, %xmm11, %xmm5 ; AVX512F-NEXT: vaddsd %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 -; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) -; AVX512F-NEXT: vmovapd %zmm3, (%rdi) +; AVX512F-NEXT: vmulsd %xmm3, %xmm8, %xmm3 +; AVX512F-NEXT: vaddsd %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm9[1],xmm7[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovsd %xmm2, 64(%rdi) +; AVX512F-NEXT: vmovapd %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -789,39 +781,39 @@ ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm7, %xmm10, %xmm7 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX512VL-NEXT: vmulpd %xmm3, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm6, %xmm11, %xmm12 +; AVX512VL-NEXT: vaddpd %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm10, %xmm10 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm10, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm11, %xmm8, %xmm10 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm10, %xmm4 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm3, %xmm11, %xmm3 ; AVX512VL-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm3, %xmm6, %xmm6 ; AVX512VL-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512VL-NEXT: vmulsd %xmm2, %xmm10, %xmm2 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm11, %xmm5 ; AVX512VL-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm7[1],xmm4[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX512VL-NEXT: vinsertf64x4 
$1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) +; AVX512VL-NEXT: vmovapd %zmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -178,8 +178,8 @@ ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx ; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -311,8 +311,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -514,8 +514,8 @@ ; X86-NEXT: xorl 4(%eax), %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: movzbl 8(%ecx), %ecx -; X86-NEXT: xorb 8(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 8(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -537,8 +537,8 @@ ; X86-NEXT: xorl 4(%eax), %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: movzwl 8(%ecx), %ecx -; X86-NEXT: xorw 8(%eax), %cx -; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl 8(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -645,8 +645,8 @@ ; X86-NEXT: movl 8(%edx), %esi ; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movzbl 12(%edx), %edx -; X86-NEXT: xorb 12(%ecx), %dl -; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: movzbl 12(%ecx), %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al @@ -671,8 +671,8 @@ ; X86-NEXT: movl 8(%edx), %esi ; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movzwl 12(%edx), %edx -; X86-NEXT: xorw 12(%ecx), %dx -; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: movzwl 12(%ecx), %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -167,9 +167,9 @@ ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -284,9 +284,9 @@ ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movzbl 4(%rdi), %ecx -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -443,9 +443,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzbl 8(%rdi), %ecx -; X64-NEXT: xorb 8(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzbl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; 
X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind @@ -459,9 +459,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzwl 8(%rdi), %ecx -; X64-NEXT: xorw 8(%rsi), %cx -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzwl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind @@ -490,8 +490,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -1636,10 +1637,96 @@ ; X64-AVX1-LABEL: length48_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vmovups 32(%rsi), %xmm2 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 32(%rsi), %rcx +; X64-AVX1-NEXT: movq 40(%rsi), %rax +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $8, %edx +; X64-AVX1-NEXT: vmovd %ecx, %xmm2 +; X64-AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $16, %edx +; X64-AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $24, %edx +; X64-AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $32, %rdx +; X64-AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $40, %rdx +; X64-AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq 
%rcx, %rdx +; X64-AVX1-NEXT: shrq $48, %rdx +; X64-AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rcx +; X64-AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al @@ -1649,10 +1736,96 @@ ; X64-AVX2-LABEL: length48_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 32(%rsi), %rcx +; X64-AVX2-NEXT: movq 40(%rsi), %rax +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $8, %edx +; X64-AVX2-NEXT: vmovd %ecx, %xmm2 +; X64-AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $16, %edx +; X64-AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; 
X64-AVX2-NEXT: shrl $24, %edx +; X64-AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $32, %rdx +; X64-AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $40, %rdx +; X64-AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $48, %rdx +; X64-AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rcx +; X64-AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al @@ -1662,10 +1835,96 @@ ; X64-AVX512-LABEL: length48_eq: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; 
X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 32(%rsi), %rcx +; X64-AVX512-NEXT: movq 40(%rsi), %rax +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $8, %edx +; X64-AVX512-NEXT: vmovd %ecx, %xmm2 +; X64-AVX512-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $16, %edx +; X64-AVX512-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $24, %edx +; X64-AVX512-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $32, %rdx +; X64-AVX512-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $40, %rdx +; X64-AVX512-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $48, %rdx +; X64-AVX512-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rcx +; X64-AVX512-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al @@ -1676,8 +1935,22 @@ ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 ; X64-MIC-AVX-NEXT: vmovdqu (%rsi), %ymm1 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm2 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rsi), %xmm3 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 32(%rsi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: movq 40(%rsi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm3, %zmm2, %k0 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; X64-MIC-AVX-NEXT: kortestw %k0, %k1 @@ -1823,9 +2096,52 @@ ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: 
movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al @@ -1835,9 +2151,52 @@ ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; 
X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al @@ -1847,9 +2206,52 @@ ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al @@ -1859,12 +2261,19 @@ ; X64-MIC-AVX-LABEL: length48_eq_const: ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 -; X64-MIC-AVX-NEXT: kortestw %k0, %k1 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; 
X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [892613426,959985462,858927408,926299444,0,0,0,0] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm0, %zmm1, %k1 +; X64-MIC-AVX-NEXT: kortestw %k1, %k0 ; X64-MIC-AVX-NEXT: setne %al ; X64-MIC-AVX-NEXT: vzeroupper ; X64-MIC-AVX-NEXT: retq @@ -2388,11 +2797,187 @@ ; X64-AVX512BW-LABEL: length96_eq: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; 
X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: movq 80(%rsi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 88(%rsi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 64(%rsi), %rcx +; X64-AVX512BW-NEXT: movq 72(%rsi), %rax +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $8, %edx +; X64-AVX512BW-NEXT: vmovd %ecx, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $16, %edx +; X64-AVX512BW-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $24, %edx +; X64-AVX512BW-NEXT: vpinsrb $3, %edx, %xmm3, 
%xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $32, %rdx +; X64-AVX512BW-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $40, %rdx +; X64-AVX512BW-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $48, %rdx +; X64-AVX512BW-NEXT: vpinsrb $6, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rcx +; X64-AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: setne %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -2400,11 +2985,43 @@ ; X64-AVX512F-LABEL: length96_eq: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rcx +; X64-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; 
X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 +; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: setne %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2429,11 +3046,43 @@ ; X64-MIC-AVX512F-LABEL: length96_eq: ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-MIC-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-MIC-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rcx +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: setne %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq @@ -2517,10 +3166,98 @@ ; X64-AVX512BW-LABEL: length96_eq_const: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb 
$5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: sete %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -2528,10 +3265,26 @@ ; X64-AVX512F-LABEL: length96_eq_const: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; 
X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: sete %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2556,10 +3309,26 @@ ; X64-MIC-AVX512F-LABEL: length96_eq_const: ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: sete %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -106,9 +106,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -197,9 +197,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: movzbl 4(%eax), %eax +; 
X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -94,10 +94,10 @@ ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax -; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rdi), %ecx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -173,10 +173,10 @@ ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax -; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rdi), %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -230,8 +230,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -106,9 +106,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -197,9 +197,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -94,10 +94,10 @@ ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax -; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rdi), %ecx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -173,10 +173,10 @@ ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax -; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rdi), %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -230,8 +230,9 @@ ; X64-NEXT: movq (%rdi), 
%rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -206,8 +206,8 @@ ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx ; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -339,8 +339,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -193,9 +193,9 @@ ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -310,9 +310,9 @@ ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movzbl 4(%rdi), %ecx -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -469,9 +469,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzbl 8(%rdi), %ecx -; X64-NEXT: xorb 8(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzbl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind @@ -485,9 +485,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzwl 8(%rdi), %ecx -; X64-NEXT: xorw 8(%rsi), %cx -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzwl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind @@ -516,8 +516,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -1493,10 +1494,96 @@ ; X64-AVX1-LABEL: length48_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vmovups 32(%rsi), %xmm2 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; 
X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 32(%rsi), %rcx +; X64-AVX1-NEXT: movq 40(%rsi), %rax +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $8, %edx +; X64-AVX1-NEXT: vmovd %ecx, %xmm2 +; X64-AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $16, %edx +; X64-AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $24, %edx +; X64-AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $32, %rdx +; X64-AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $40, %rdx +; X64-AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $48, %rdx +; X64-AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rcx +; X64-AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al @@ -1506,10 +1593,96 @@ ; X64-AVX2-LABEL: length48_eq: ; 
X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 32(%rsi), %rcx +; X64-AVX2-NEXT: movq 40(%rsi), %rax +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $8, %edx +; X64-AVX2-NEXT: vmovd %ecx, %xmm2 +; X64-AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $16, %edx +; X64-AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $24, %edx +; X64-AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $32, %rdx +; X64-AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $40, %rdx +; X64-AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $48, %rdx +; X64-AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rcx +; X64-AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; 
X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al @@ -1519,10 +1692,96 @@ ; X64-AVX512-LABEL: length48_eq: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 32(%rsi), %rcx +; X64-AVX512-NEXT: movq 40(%rsi), %rax +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $8, %edx +; X64-AVX512-NEXT: vmovd %ecx, %xmm2 +; X64-AVX512-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $16, %edx +; X64-AVX512-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $24, %edx +; X64-AVX512-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $32, %rdx +; X64-AVX512-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $40, %rdx +; X64-AVX512-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $48, %rdx +; X64-AVX512-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rcx +; X64-AVX512-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, 
%xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al @@ -1533,8 +1792,22 @@ ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 ; X64-MIC-AVX-NEXT: vmovdqu (%rsi), %ymm1 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm2 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rsi), %xmm3 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 32(%rsi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: movq 40(%rsi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm3, %zmm2, %k0 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; X64-MIC-AVX-NEXT: kortestw %k0, %k1 @@ -1606,9 +1879,52 @@ ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; 
X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al @@ -1618,9 +1934,52 @@ ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al @@ -1630,9 +1989,52 @@ ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq 
$40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al @@ -1642,12 +2044,19 @@ ; X64-MIC-AVX-LABEL: length48_eq_const: ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 -; X64-MIC-AVX-NEXT: kortestw %k0, %k1 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [892613426,959985462,858927408,926299444,0,0,0,0] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm0, %zmm1, %k1 +; X64-MIC-AVX-NEXT: kortestw %k1, %k0 ; X64-MIC-AVX-NEXT: setne %al ; X64-MIC-AVX-NEXT: vzeroupper ; X64-MIC-AVX-NEXT: retq @@ -2047,11 +2456,187 @@ ; X64-AVX512BW-LABEL: length96_eq: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; 
X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: movq 80(%rsi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx 
+; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 88(%rsi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 64(%rsi), %rcx +; X64-AVX512BW-NEXT: movq 72(%rsi), %rax +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $8, %edx +; X64-AVX512BW-NEXT: vmovd %ecx, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $16, %edx +; X64-AVX512BW-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $24, %edx +; X64-AVX512BW-NEXT: vpinsrb $3, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $32, %rdx +; X64-AVX512BW-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $40, %rdx +; X64-AVX512BW-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $48, %rdx +; X64-AVX512BW-NEXT: vpinsrb $6, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rcx +; X64-AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: 
vpinsrb $14, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: setne %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -2059,11 +2644,43 @@ ; X64-AVX512F-LABEL: length96_eq: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rcx +; X64-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 +; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: setne %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2081,11 +2698,43 @@ ; X64-MIC-AVX512F-LABEL: length96_eq: ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; 
X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-MIC-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-MIC-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rcx +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: setne %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq @@ -2161,10 +2810,98 @@ ; X64-AVX512BW-LABEL: length96_eq_const: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx 
+; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: sete %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -2172,10 +2909,26 @@ ; X64-AVX512F-LABEL: length96_eq_const: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: sete %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2194,10 +2947,26 @@ ; X64-MIC-AVX512F-LABEL: 
length96_eq_const: ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: sete %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -735,10 +735,10 @@ ; SANDYBRIDGE-LABEL: memset_64: ; SANDYBRIDGE: # %bb.0: # %entry ; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SANDYBRIDGE-NEXT: vmovups %xmm0, 16(%rdi) -; SANDYBRIDGE-NEXT: vmovups %xmm0, (%rdi) ; SANDYBRIDGE-NEXT: vmovups %xmm0, 48(%rdi) ; SANDYBRIDGE-NEXT: vmovups %xmm0, 32(%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, 16(%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, (%rdi) ; SANDYBRIDGE-NEXT: retq ; ; SKYLAKE-LABEL: memset_64: diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll --- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll @@ -18,26 +18,26 @@ ; SLOW_32-LABEL: bork: ; SLOW_32: # %bb.0: ; SLOW_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SLOW_32-NEXT: movl $0, 4(%eax) -; SLOW_32-NEXT: movl $0, (%eax) -; SLOW_32-NEXT: movl $0, 12(%eax) -; SLOW_32-NEXT: movl $0, 8(%eax) -; SLOW_32-NEXT: movl $0, 20(%eax) -; SLOW_32-NEXT: movl $0, 16(%eax) -; SLOW_32-NEXT: movl $0, 28(%eax) -; SLOW_32-NEXT: movl $0, 24(%eax) -; SLOW_32-NEXT: movl $0, 36(%eax) -; SLOW_32-NEXT: movl $0, 32(%eax) -; SLOW_32-NEXT: movl $0, 44(%eax) -; SLOW_32-NEXT: movl $0, 40(%eax) -; SLOW_32-NEXT: movl $0, 52(%eax) -; SLOW_32-NEXT: movl $0, 48(%eax) -; SLOW_32-NEXT: movl $0, 60(%eax) -; SLOW_32-NEXT: movl $0, 56(%eax) -; SLOW_32-NEXT: movl $0, 68(%eax) -; SLOW_32-NEXT: movl $0, 64(%eax) ; SLOW_32-NEXT: movl $0, 76(%eax) ; SLOW_32-NEXT: movl $0, 72(%eax) +; SLOW_32-NEXT: movl $0, 68(%eax) +; SLOW_32-NEXT: movl $0, 64(%eax) +; SLOW_32-NEXT: movl $0, 60(%eax) +; SLOW_32-NEXT: movl $0, 56(%eax) +; SLOW_32-NEXT: movl $0, 52(%eax) +; SLOW_32-NEXT: movl $0, 48(%eax) +; SLOW_32-NEXT: movl $0, 44(%eax) +; SLOW_32-NEXT: movl $0, 40(%eax) +; SLOW_32-NEXT: movl $0, 36(%eax) +; SLOW_32-NEXT: movl $0, 32(%eax) +; SLOW_32-NEXT: movl $0, 28(%eax) +; SLOW_32-NEXT: movl $0, 24(%eax) +; SLOW_32-NEXT: movl $0, 20(%eax) +; SLOW_32-NEXT: movl $0, 16(%eax) +; 
SLOW_32-NEXT: movl $0, 12(%eax) +; SLOW_32-NEXT: movl $0, 8(%eax) +; SLOW_32-NEXT: movl $0, 4(%eax) +; SLOW_32-NEXT: movl $0, (%eax) ; SLOW_32-NEXT: retl ; ; SLOW_64-LABEL: bork: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1226,9 +1226,9 @@ ; ; AVX-LABEL: merge_4f32_f32_X0YY: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vbroadcastss (%rsi), %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq ; ; X86-SSE-LABEL: merge_4f32_f32_X0YY: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -8,13 +8,17 @@ define <8 x double> @merge_8f64_2f64_12u4(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_12u4: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %zmm0 +; ALL-NEXT: vmovups 16(%rdi), %ymm0 +; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_12u4: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %zmm0 +; X86-AVX512F-NEXT: vmovups 16(%eax), %ymm0 +; X86-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 1 %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2 @@ -31,15 +35,19 @@ define <8 x double> @merge_8f64_2f64_23z5(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_23z5: ; ALL: # %bb.0: -; ALL-NEXT: vmovdqu64 32(%rdi), %zmm0 -; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vmovups 32(%rdi), %ymm0 +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_23z5: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovdqu64 32(%eax), %zmm0 -; X86-AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovups 32(%eax), %ymm0 +; X86-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2 %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 3 @@ -209,7 +217,7 @@ ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 -; X86-AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1 %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 3 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll 
@@ -324,8 +324,8 @@ ; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero ; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero ; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero -; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax) ; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax) ; X86-SSE4A-NEXT: retl @@ -352,8 +352,8 @@ ; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero ; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero ; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero -; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi) ; X64-SSE4A-NEXT: retq @@ -435,8 +435,8 @@ ; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero ; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero ; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero -; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax) ; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax) ; X86-SSE4A-NEXT: retl @@ -463,8 +463,8 @@ ; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero ; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero ; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero -; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi) ; X64-SSE4A-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll --- a/llvm/test/CodeGen/X86/merge-store-constants.ll +++ b/llvm/test/CodeGen/X86/merge-store-constants.ll @@ -12,8 +12,10 @@ ; ; X64-LABEL: big_nonzero_16_bytes: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4] -; X64-NEXT: vmovups %xmm0, (%rdi) +; X64-NEXT: movabsq $8589934593, %rax # imm = 0x200000001 +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movabsq $17179869187, %rax # imm = 0x400000003 +; X64-NEXT: movq %rax, 8(%rdi) ; X64-NEXT: retq %arrayidx1 = getelementptr inbounds i32, ptr %a, i64 1 %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 2 @@ -58,9 +60,9 @@ ; X32-LABEL: big_nonzero_32_bytes_splat: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] -; X32-NEXT: vmovups %ymm0, (%eax) -; X32-NEXT: vzeroupper +; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] +; X32-NEXT: vmovups %xmm0, (%eax) +; X32-NEXT: vmovups %xmm0, 16(%eax) ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_32_bytes_splat: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -1869,13 +1869,13 @@ ; ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; 
AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1994,13 +1994,13 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2102,13 +2102,13 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm0, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm0, %xmm2, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2210,13 +2210,13 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2323,13 +2323,13 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 ; 
AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2554,14 +2554,20 @@ ; ; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_reg: @@ -2787,14 +2793,20 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg: @@ -3026,14 +3038,20 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_reg: @@ -3263,14 +3281,20 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_mem: ; 
AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_mem: @@ -3509,14 +3533,20 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1328,13 +1328,13 @@ ; ; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1456,13 +1456,13 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1584,13 +1584,13 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm0, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: 
vpminsw %ymm0, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1712,13 +1712,13 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1845,13 +1845,13 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2032,13 +2032,23 @@ ; ; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} 
ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2221,13 +2231,23 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2410,13 +2430,23 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: 
vpminsb %ymm0, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2599,13 +2629,23 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2793,13 +2833,23 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa 
(%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -274,22 +274,19 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw 
%ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -301,22 +298,19 @@ ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -354,19 +348,17 @@ ; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512F-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -381,19 +373,17 @@ ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; 
AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm7, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -431,22 +421,19 @@ ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -459,22 +446,19 @@ ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, 
%ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -512,22 +496,19 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -540,22 +521,19 @@ ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, 
%ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -594,22 +572,19 @@ ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -623,22 +598,19 @@ ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, 
%ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -679,62 +651,88 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; 
AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -765,62 +763,90 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpminub %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512F-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw 
$1, %ymm3, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm4, %zmm4 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; 
AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm4, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -853,66 +879,92 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 +; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm0, %ymm8, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm0, %ymm8, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_reg: @@ -943,66 +995,92 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; 
AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, 
%ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_mem: @@ -1033,67 +1111,93 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: 
vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -17,10 +17,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa 
%ymm0, (%rdx) +; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %d = load <16 x i32>, <16 x i32>* %a @@ -48,12 +48,54 @@ define dso_local void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" { ; CHECK-LABEL: avg_v64i8_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rsi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm13 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovdb %ymm15, %xmm15 +; CHECK-NEXT: vpmovdb %ymm14, %xmm14 +; CHECK-NEXT: vpmovdb %ymm13, %xmm13 +; CHECK-NEXT: vpmovdb %ymm12, %xmm12 +; CHECK-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 +; CHECK-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 +; CHECK-NEXT: vpmovdb %ymm7, %xmm7 +; CHECK-NEXT: vpmovdb %ymm6, %xmm6 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; CHECK-NEXT: vpmovdb %ymm5, %xmm5 +; CHECK-NEXT: vpmovdb %ymm4, %xmm4 +; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; CHECK-NEXT: vpmovdb %ymm11, %xmm5 +; CHECK-NEXT: vpmovdb %ymm10, %xmm6 +; CHECK-NEXT: vpavgb %ymm4, %ymm12, %ymm4 +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; CHECK-NEXT: vpmovdb %ymm9, %xmm6 +; CHECK-NEXT: vpmovdb %ymm8, %xmm7 +; CHECK-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; CHECK-NEXT: vpmovdb %ymm3, %xmm3 +; CHECK-NEXT: vpmovdb %ymm2, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vpmovdb %ymm1, %xmm1 +; CHECK-NEXT: vpmovdb %ymm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; CHECK-NEXT: vpavgb %ymm5, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-NEXT: vmovdqu %ymm4, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a @@ -72,8 +114,29 @@ define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" { ; CHECK-LABEL: avg_v64i8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovdb %zmm7, %xmm7 +; CHECK-NEXT: vpmovdb %zmm6, %xmm6 +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; CHECK-NEXT: vpmovdb %zmm5, %xmm5 +; CHECK-NEXT: vpmovdb %zmm4, %xmm4 +; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; CHECK-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; CHECK-NEXT: vpmovdb %zmm3, %xmm3 +; CHECK-NEXT: vpmovdb %zmm2, %xmm2 +; CHECK-NEXT: vpmovdb %zmm1, %xmm1 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; CHECK-NEXT: vpavgb %zmm4, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -92,12 +155,26 @@ define dso_local void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" { ; CHECK-LABEL: pmaddwd_32_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpmovsxwd 48(%rdi), 
%ymm0 +; CHECK-NEXT: vpmovsxwd 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxwd 16(%rdi), %ymm2 +; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpmovsxwd (%rdi), %ymm1 +; CHECK-NEXT: vpackssdw %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vpmovsxwd 48(%rsi), %ymm2 +; CHECK-NEXT: vpmovsxwd 32(%rsi), %ymm3 +; CHECK-NEXT: vpmovsxwd 16(%rsi), %ymm4 +; CHECK-NEXT: vpmovsxwd (%rsi), %ymm5 +; CHECK-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; CHECK-NEXT: vpackssdw %ymm4, %ymm5, %ymm3 +; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; CHECK-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 +; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm1, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %A = load <32 x i16>, <32 x i16>* %APtr @@ -115,8 +192,17 @@ define dso_local void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" { ; CHECK-LABEL: pmaddwd_32_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vpmovsxwd 32(%rdi), %zmm0 +; CHECK-NEXT: vpmovsxwd (%rdi), %zmm1 +; CHECK-NEXT: vpmovsxwd 32(%rsi), %zmm2 +; CHECK-NEXT: vpmovsxwd (%rsi), %zmm3 +; CHECK-NEXT: vpmovdw %zmm3, %ymm3 +; CHECK-NEXT: vpmovdw %zmm2, %ymm2 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-NEXT: vpmovdw %zmm1, %ymm1 +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -137,10 +223,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <64 x i8>, <64 x i8>* %xptr @@ -180,14 +266,26 @@ ; CHECK-SKX-NEXT: .p2align 4, 0x90 ; CHECK-SKX-NEXT: .LBB8_1: # %vector.body ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-SKX-NEXT: vpmovsxbd 24(%rdi,%rcx), %ymm3 +; CHECK-SKX-NEXT: vpmovsxbd 16(%rdi,%rcx), %ymm4 +; CHECK-SKX-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm5 +; CHECK-SKX-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; CHECK-SKX-NEXT: vpmovsxbd (%rdi,%rcx), %ymm4 +; CHECK-SKX-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 +; CHECK-SKX-NEXT: vpmovsxbd 24(%rsi,%rcx), %ymm5 +; CHECK-SKX-NEXT: vpmovsxbd 16(%rsi,%rcx), %ymm6 +; CHECK-SKX-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm7 +; CHECK-SKX-NEXT: vpmovsxbd (%rsi,%rcx), %ymm8 +; CHECK-SKX-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; CHECK-SKX-NEXT: vpackssdw %ymm7, %ymm8, %ymm6 +; CHECK-SKX-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; CHECK-SKX-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; CHECK-SKX-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; CHECK-SKX-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; 
CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-SKX-NEXT: addq $32, %rcx ; CHECK-SKX-NEXT: cmpq %rcx, %rax ; CHECK-SKX-NEXT: jne .LBB8_1 @@ -196,9 +294,9 @@ ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -215,14 +313,26 @@ ; CHECK-AVX512-NEXT: .p2align 4, 0x90 ; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbd 24(%rdi,%rcx), %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbd 16(%rdi,%rcx), %ymm4 +; CHECK-AVX512-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm5 +; CHECK-AVX512-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbd (%rdi,%rcx), %ymm4 +; CHECK-AVX512-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 +; CHECK-AVX512-NEXT: vpmovsxbd 24(%rsi,%rcx), %ymm5 +; CHECK-AVX512-NEXT: vpmovsxbd 16(%rsi,%rcx), %ymm6 +; CHECK-AVX512-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm7 +; CHECK-AVX512-NEXT: vpmovsxbd (%rsi,%rcx), %ymm8 +; CHECK-AVX512-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; CHECK-AVX512-NEXT: vpackssdw %ymm7, %ymm8, %ymm6 +; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-AVX512-NEXT: addq $32, %rcx ; CHECK-AVX512-NEXT: cmpq %rcx, %rax ; CHECK-AVX512-NEXT: jne .LBB8_1 @@ -231,9 +341,9 @@ ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -250,14 +360,26 @@ ; CHECK-VBMI-NEXT: .p2align 4, 0x90 ; CHECK-VBMI-NEXT: .LBB8_1: # %vector.body ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-VBMI-NEXT: vpmovsxbd 24(%rdi,%rcx), %ymm3 +; CHECK-VBMI-NEXT: vpmovsxbd 16(%rdi,%rcx), %ymm4 +; CHECK-VBMI-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm5 +; CHECK-VBMI-NEXT: vpackssdw %ymm3, %ymm4, 
%ymm3 +; CHECK-VBMI-NEXT: vpmovsxbd (%rdi,%rcx), %ymm4 +; CHECK-VBMI-NEXT: vpackssdw %ymm5, %ymm4, %ymm4 +; CHECK-VBMI-NEXT: vpmovsxbd 24(%rsi,%rcx), %ymm5 +; CHECK-VBMI-NEXT: vpmovsxbd 16(%rsi,%rcx), %ymm6 +; CHECK-VBMI-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm7 +; CHECK-VBMI-NEXT: vpmovsxbd (%rsi,%rcx), %ymm8 +; CHECK-VBMI-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; CHECK-VBMI-NEXT: vpackssdw %ymm7, %ymm8, %ymm6 +; CHECK-VBMI-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; CHECK-VBMI-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; CHECK-VBMI-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; CHECK-VBMI-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: addq $32, %rcx ; CHECK-VBMI-NEXT: cmpq %rcx, %rax ; CHECK-VBMI-NEXT: jne .LBB8_1 @@ -266,9 +388,9 @@ ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -320,8 +442,12 @@ ; CHECK-SKX-NEXT: .p2align 4, 0x90 ; CHECK-SKX-NEXT: .LBB9_1: # %vector.body ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-SKX-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; CHECK-SKX-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-SKX-NEXT: addq $32, %rcx @@ -332,9 +458,9 @@ ; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -350,8 +476,12 @@ ; CHECK-AVX512-NEXT: .p2align 4, 0x90 ; CHECK-AVX512-NEXT: .LBB9_1: # %vector.body ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: addq $32, %rcx @@ 
-362,9 +492,9 @@ ; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -380,8 +510,12 @@ ; CHECK-VBMI-NEXT: .p2align 4, 0x90 ; CHECK-VBMI-NEXT: .LBB9_1: # %vector.body ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-VBMI-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; CHECK-VBMI-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-VBMI-NEXT: addq $32, %rcx @@ -392,9 +526,9 @@ ; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -456,9 +590,9 @@ ; CHECK-SKX-NEXT: # %bb.2: # %middle.block ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -481,9 +615,9 @@ ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -506,9 +640,9 @@ ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -567,9 +701,9 @@ ; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -592,9 +726,9 @@ ; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -617,9 +751,9 @@ ; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -889,23 +1023,23 @@ ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 -; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper ; CHECK-SKX-VBMI-NEXT: retq ; @@ -915,27 +1049,27 @@ ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; CHECK-AVX512-NEXT: vpand %ymm5, %ymm2, %ymm2 +; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 
= ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq ; @@ -945,23 +1079,23 @@ ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 -; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm0 +; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm1 ; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-VBMI-NEXT: vzeroupper ; CHECK-VBMI-NEXT: retq %d = load <64 x i8>, <64 x i8>* %a @@ -1069,11 +1203,12 @@ ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 ; CHECK-NEXT: vpmovqb %ymm3, %xmm3 ; CHECK-NEXT: vpmovqb %ymm2, %xmm2 -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} 
xmm4 = [0,4,0,4] +; CHECK-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; CHECK-NEXT: vpmovqb %ymm1, %xmm1 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %a = load <16 x i64>, <16 x i64>* %x @@ -1415,10 +1550,11 @@ ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 ; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0 ; CHECK-NEXT: kshiftrd $1, %k0, %k1 +; CHECK-NEXT: movq $-3, %rax +; CHECK-NEXT: kmovq %rax, %k2 +; CHECK-NEXT: kandq %k2, %k1, %k1 ; CHECK-NEXT: kshiftlq $63, %k0, %k2 ; CHECK-NEXT: kshiftrq $62, %k2, %k2 -; CHECK-NEXT: kshiftlq $63, %k1, %k1 -; CHECK-NEXT: kshiftrq $63, %k1, %k1 ; CHECK-NEXT: korq %k2, %k1, %k1 ; CHECK-NEXT: movq $-5, %rax ; CHECK-NEXT: kmovq %rax, %k2 @@ -1889,10 +2025,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rsi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x i64>, <8 x i64>* %xptr @@ -1908,12 +2044,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rsi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1 +; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x i64>, <8 x i64>* %xptr diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll --- a/llvm/test/CodeGen/X86/mmx-cvt.ll +++ b/llvm/test/CodeGen/X86/mmx-cvt.ll @@ -260,17 +260,11 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind { ; X86-LABEL: sitofp_v2i32_v2f64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movq (%eax), %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: cvtdq2pd (%esp), %xmm0 -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: cvtdq2pd %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_v2i32_v2f64: diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -259,12 +259,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: cmpb $-1, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: setb %al ; KNL-NEXT: retq ; ; SKX-LABEL: allones_v8i16_sign: @@ -298,11 +294,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: 
sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v8i16_sign: @@ -350,9 +343,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 ; KNL-NEXT: setb %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -401,9 +393,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -424,10 +415,10 @@ define i1 @allones_v32i16_sign(<32 x i16> %arg) { ; SSE-LABEL: allones_v32i16_sign: ; SSE: # %bb.0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -456,18 +447,15 @@ ; ; KNL-LABEL: allones_v32i16_sign: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 -; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl %eax, %ecx -; KNL-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 +; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -488,9 +476,9 @@ define i1 @allzeros_v32i16_sign(<32 x i16> %arg) { ; SSE-LABEL: allzeros_v32i16_sign: ; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sete %al @@ -524,9 +512,8 @@ ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper @@ -836,10 +823,10 @@ ; AVX1-LABEL: allones_v8i64_sign: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: 
vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al @@ -892,10 +879,10 @@ ; AVX1-LABEL: allzeros_v8i64_sign: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper @@ -976,7 +963,7 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -1002,8 +989,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -1013,11 +1002,29 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX-LABEL: allzeros_v2i64_not: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: setne %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: allzeros_v2i64_not: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: retq +; +; KNL-LABEL: allzeros_v2i64_not: +; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al +; KNL-NEXT: setne %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v2i64_not: +; SKX: # %bb.0: +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: cmpb $3, %al +; SKX-NEXT: setne %al +; SKX-NEXT: retq %1 = icmp eq <2 x i64> %a0, zeroinitializer %2 = bitcast <2 x i1> %1 to i2 %3 = icmp ne i2 %2, -1 @@ -1025,29 +1032,40 @@ } define i1 @allzeros_v8i32_not(<8 x i32> %a0) { -; SSE2-LABEL: allzeros_v8i32_not: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq +; SSE-LABEL: allzeros_v8i32_not: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; SSE41-LABEL: allzeros_v8i32_not: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; AVX1-LABEL: allzeros_v8i32_not: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; AVX-LABEL: allzeros_v8i32_not: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-LABEL: allzeros_v8i32_not: +; AVX2: # %bb.0: +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: 
allzeros_v8i32_not: +; AVX512: # %bb.0: +; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp eq <8 x i32> %a0, zeroinitializer %2 = bitcast <8 x i1> %1 to i8 %3 = icmp ne i8 %2, -1 @@ -1057,38 +1075,71 @@ define i1 @allzeros_v8i64_not(<8 x i64> %a0) { ; SSE2-LABEL: allzeros_v8i64_not: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: allzeros_v8i64_not: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_not: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 -; AVX1-NEXT: setne %al +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: setae %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_not: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 -; AVX2-NEXT: setne %al +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: setae %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1106,29 +1157,27 @@ } define i1 @allzeros_v16i8_and1(<16 x i8> %arg) { -; SSE2-LABEL: allzeros_v16i8_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i8_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete 
%al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i8_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v16i8_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl %eax, %eax ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v16i8_and1: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: vpsllw $7, %xmm0, %xmm0 +; KNL-NEXT: vpmovmskb %xmm0, %eax +; KNL-NEXT: testl %eax, %eax ; KNL-NEXT: sete %al ; KNL-NEXT: retq ; @@ -1199,44 +1248,51 @@ } define i1 @allzeros_v32i8_and1(<32 x i8> %arg) { -; SSE2-LABEL: allzeros_v32i8_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i8_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i8_and1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i8_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i8_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i8_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i8_and1: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1258,11 +1314,11 @@ ; ; AVX1-LABEL: allones_v64i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $7, 
%xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF @@ -1306,30 +1362,27 @@ } define i1 @allzeros_v64i8_and1(<64 x i8> %arg) { -; SSE2-LABEL: allzeros_v64i8_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v64i8_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v64i8_and1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v64i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1337,19 +1390,31 @@ ; AVX2-LABEL: allzeros_v64i8_and1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v64i8_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v64i8_and1: +; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v64i8_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <64 x i8> %arg, %tmp1 = icmp ne <64 x i8> %tmp, zeroinitializer %tmp2 = bitcast <64 x i1> %tmp1 to i64 @@ -1378,14 +1443,12 @@ ; ; KNL-LABEL: allones_v8i16_and1: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 -; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: cmpb $-1, %al +; KNL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; KNL-NEXT: vptest %xmm1, %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allones_v8i16_and1: @@ -1402,28 +1465,26 @@ } define i1 @allzeros_v8i16_and1(<8 x i16> %arg) { -; SSE2-LABEL: allzeros_v8i16_and1: -; SSE2: # %bb.0: -; 
SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i16_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i16_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v8i16_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v8i16_and1: ; KNL: # %bb.0: +; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al ; KNL-NEXT: retq @@ -1477,12 +1538,12 @@ ; ; KNL-LABEL: allones_v16i16_and1: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: setb %al +; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -1542,18 +1603,13 @@ ; KNL-LABEL: allones_v32i16_and1: ; KNL: # %bb.0: ; KNL-NEXT: vpsllw $15, %ymm0, %ymm1 -; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl %eax, %ecx -; KNL-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -1573,50 +1629,65 @@ } define i1 @allzeros_v32i16_and1(<32 x i16> %arg) { -; SSE2-LABEL: allzeros_v32i16_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i16_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i16_and1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: psllw $15, %xmm1 +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i16_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i16_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i16_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i16_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $15, %ymm0, %ymm1 +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i16_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i16> %arg, %tmp1 = icmp ne <32 x i16> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1625,44 +1696,53 @@ } define i1 @allzeros_v16i16_and1(<16 x i16> %arg) { -; SSE2-LABEL: allzeros_v16i16_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i16_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i16_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm1 +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i16_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i16_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: 
allzeros_v16i16_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v16i16_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v16i16_and1: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -1712,36 +1792,35 @@ } define i1 @allzeros_v4i32_and1(<4 x i32> %arg) { -; SSE2-LABEL: allzeros_v4i32_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i32_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i32_and1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v4i32_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v4i32_and1: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v4i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <4 x i32> %arg, @@ -1808,33 +1887,30 @@ } define i1 @allzeros_v8i32_and1(<8 x i32> %arg) { -; SSE2-LABEL: allzeros_v8i32_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i32_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i32_and1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i32_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
allzeros_v8i32_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1913,39 +1989,46 @@ } define i1 @allzeros_v16i32_and1(<16 x i32> %arg) { -; SSE2-LABEL: allzeros_v16i32_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i32_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i32_and1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm3 +; SSE-NEXT: pslld $31, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i32_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i32_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2007,36 +2090,36 @@ } define i1 @allzeros_v2i64_and1(<2 x i64> %arg) { -; SSE2-LABEL: allzeros_v2i64_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v2i64_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v2i64_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v2i64_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v2i64_and1: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [1,1] +; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v2i64_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, @@ -2103,44 +2186,51 @@ } define i1 @allzeros_v4i64_and1(<4 x i64> %arg) { -; SSE2-LABEL: allzeros_v4i64_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i64_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i64_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm1 +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v4i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vtestpd %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v4i64_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v4i64_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v4i64_and1: +; KNL: # %bb.0: +; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v4i64_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -2166,15 +2256,15 @@ ; ; AVX1-LABEL: allones_v8i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $63, %xmm1, %xmm2 -; AVX1-NEXT: vpsllq $63, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al @@ -2216,39 +2306,42 @@ } define i1 
@allzeros_v8i64_and1(<8 x i64> %arg) { -; SSE2-LABEL: allzeros_v8i64_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i64_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i64_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm3 +; SSE-NEXT: psllq $63, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psllq $63, %xmm1 +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2307,29 +2400,27 @@ } define i1 @allzeros_v16i8_and4(<16 x i8> %arg) { -; SSE2-LABEL: allzeros_v16i8_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i8_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i8_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v16i8_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl %eax, %eax ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v16i8_and4: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: vpsllw $5, %xmm0, %xmm0 +; KNL-NEXT: vpmovmskb %xmm0, %eax +; KNL-NEXT: testl %eax, %eax ; KNL-NEXT: sete %al ; KNL-NEXT: retq ; @@ -2400,44 +2491,51 @@ } define i1 @allzeros_v32i8_and4(<32 x i8> %arg) { -; SSE2-LABEL: allzeros_v32i8_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i8_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: 
ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i8_and4: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i8_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i8_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i8_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i8_and4: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -2459,11 +2557,11 @@ ; ; AVX1-LABEL: allones_v64i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF @@ -2507,30 +2605,27 @@ } define i1 @allzeros_v64i8_and4(<64 x i8> %arg) { -; SSE2-LABEL: allzeros_v64i8_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v64i8_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v64i8_and4: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v64i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2538,19 +2633,31 @@ ; AVX2-LABEL: allzeros_v64i8_and4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v64i8_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v64i8_and4: +; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v64i8_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <64 x i8> %arg, %tmp1 = icmp ne <64 x i8> %tmp, zeroinitializer %tmp2 = bitcast <64 x i1> %tmp1 to i64 @@ -2579,14 +2686,11 @@ ; ; KNL-LABEL: allones_v8i16_and4: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $13, %xmm0, %xmm0 -; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: cmpb $-1, %al +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allones_v8i16_and4: @@ -2603,28 +2707,26 @@ } define i1 @allzeros_v8i16_and4(<8 x i16> %arg) { -; SSE2-LABEL: allzeros_v8i16_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i16_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i16_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $13, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v8i16_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $13, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v8i16_and4: ; KNL: # %bb.0: +; KNL-NEXT: vpsllw $13, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al ; KNL-NEXT: retq @@ -2678,12 +2780,12 @@ ; ; KNL-LABEL: allones_v16i16_and4: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $13, 
%ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: setb %al +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -2743,18 +2845,13 @@ ; KNL-LABEL: allones_v32i16_and4: ; KNL: # %bb.0: ; KNL-NEXT: vpsllw $13, %ymm0, %ymm1 -; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl %eax, %ecx -; KNL-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -2774,50 +2871,65 @@ } define i1 @allzeros_v32i16_and4(<32 x i16> %arg) { -; SSE2-LABEL: allzeros_v32i16_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i16_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i16_and4: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: psllw $13, %xmm1 +; SSE-NEXT: psllw $13, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i16_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsllw $13, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $13, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i16_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $13, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i16_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw 
%k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i16_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $13, %ymm0, %ymm1 +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i16_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i16> %arg, %tmp1 = icmp ne <32 x i16> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -2826,44 +2938,54 @@ } define i1 @allzeros_v16i16_and4(<16 x i16> %arg) { -; SSE2-LABEL: allzeros_v16i16_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i16_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i16_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $13, %xmm1 +; SSE-NEXT: psllw $13, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i16_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllw $13, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $13, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i16_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v16i16_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v16i16_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v16i16_and4: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -2913,36 +3035,35 @@ } define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { -; SSE2-LABEL: allzeros_v4i32_and4: -; SSE2: # 
%bb.0: -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i32_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i32_and4: +; SSE: # %bb.0: +; SSE-NEXT: pslld $29, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v4i32_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v4i32_and4: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v4i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17179869188,17179869188] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <4 x i32> %arg, @@ -3009,33 +3130,30 @@ } define i1 @allzeros_v8i32_and4(<8 x i32> %arg) { -; SSE2-LABEL: allzeros_v8i32_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i32_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i32_and4: +; SSE: # %bb.0: +; SSE-NEXT: pslld $29, %xmm1 +; SSE-NEXT: pslld $29, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i32_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i32_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869188,17179869188,17179869188,17179869188] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3114,39 +3232,46 @@ } define i1 @allzeros_v16i32_and4(<16 x i32> %arg) { -; SSE2-LABEL: allzeros_v16i32_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i32_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i32_and4: +; SSE: # %bb.0: +; SSE-NEXT: pslld $29, %xmm3 +; SSE-NEXT: 
pslld $29, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pslld $29, %xmm1 +; SSE-NEXT: pslld $29, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i32_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i32_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869188,17179869188,17179869188,17179869188] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $29, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3208,36 +3333,36 @@ } define i1 @allzeros_v2i64_and4(<2 x i64> %arg) { -; SSE2-LABEL: allzeros_v2i64_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v2i64_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v2i64_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v2i64_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v2i64_and4: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v2i64_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, @@ -3304,44 +3429,51 @@ } define i1 @allzeros_v4i64_and4(<4 x i64> %arg) { -; SSE2-LABEL: allzeros_v4i64_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i64_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i64_and4: 
+; SSE: # %bb.0: +; SSE-NEXT: psllq $61, %xmm1 +; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v4i64_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1-NEXT: vtestpd %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v4i64_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v4i64_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v4i64_and4: +; KNL: # %bb.0: +; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v4i64_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -3367,15 +3499,15 @@ ; ; AVX1-LABEL: allones_v8i64_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $61, %xmm1, %xmm2 -; AVX1-NEXT: vpsllq $61, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2 ; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al @@ -3417,39 +3549,42 @@ } define i1 @allzeros_v8i64_and4(<8 x i64> %arg) { -; SSE2-LABEL: allzeros_v8i64_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i64_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i64_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllq $61, %xmm3 +; SSE-NEXT: psllq $61, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psllq $61, %xmm1 +; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_and4: ; 
AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $61, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3723,20 +3858,34 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: notb %al -; SSE-NEXT: testb $-109, %al -; SSE-NEXT: sete %al +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: movl %ecx, %edx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb %al +; SSE-NEXT: andb %cl, %al +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: andb $16, %dl +; SSE-NEXT: shrb $4, %dl +; SSE-NEXT: andb %cl, %dl +; SSE-NEXT: andb %dl, %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v8i16: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: notb %al -; AVX1OR2-NEXT: testb $-109, %al -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vpmovmskb %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %edx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al +; AVX1OR2-NEXT: # kill: def $cl killed $cl killed $ecx +; AVX1OR2-NEXT: shrb $7, %cl +; AVX1OR2-NEXT: andb $16, %dl +; AVX1OR2-NEXT: shrb $4, %dl +; AVX1OR2-NEXT: andb %cl, %dl +; AVX1OR2-NEXT: andb %dl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v8i16: @@ -3832,24 +3981,31 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: movmsk_and_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE41-NEXT: movmskpd %xmm0, %eax -; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: sete %al +; SSE41-NEXT: movmskpd %xmm0, %ecx +; SSE41-NEXT: xorl $3, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb %al +; SSE41-NEXT: andb %cl, %al ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_and_v2i64: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: xorl $3, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_and_v2i64: @@ -3883,24 +4039,39 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: cmpl $3, %eax 
-; SSE2-NEXT: setne %al +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: movmsk_or_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: setne %al +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE41-NEXT: movmskpd %xmm0, %ecx +; SSE41-NEXT: xorl $3, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb %al +; SSE41-NEXT: orb %cl, %al ; SSE41-NEXT: retq ; -; AVX-LABEL: movmsk_or_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: setne %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: movmsk_or_v2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: xorl $3, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: movmsk_or_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: setne %al +; AVX512-NEXT: retq %cmp = icmp ne <2 x i64> %x, %y %e1 = extractelement <2 x i1> %cmp, i32 0 %e2 = extractelement <2 x i1> %cmp, i32 1 @@ -3933,18 +4104,24 @@ ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmpeq_uqps %zmm1, %zmm0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $14, %al +; KNL-NEXT: testb $6, %al ; KNL-NEXT: setne %al +; KNL-NEXT: orb %cl, %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v4f32: ; SKX: # %bb.0: ; SKX-NEXT: vcmpeq_uqps %xmm1, %xmm0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $14, %al +; SKX-NEXT: testb $6, %al ; SKX-NEXT: setne %al +; SKX-NEXT: orb %cl, %al ; SKX-NEXT: retq %cmp = fcmp ueq <4 x float> %x, %y %e1 = extractelement <4 x i1> %cmp, i32 1 @@ -3959,17 +4136,19 @@ ; SSE-LABEL: movmsk_and_v2f64: ; SSE: # %bb.0: ; SSE-NEXT: cmplepd %xmm0, %xmm1 -; SSE-NEXT: movmskpd %xmm1, %eax -; SSE-NEXT: cmpl $3, %eax -; SSE-NEXT: sete %al +; SSE-NEXT: movmskpd %xmm1, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb %al +; SSE-NEXT: andb %cl, %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_and_v2f64: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0 -; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_and_v2f64: @@ -4002,16 +4181,19 @@ ; SSE-LABEL: movmsk_or_v2f64: ; SSE: # %bb.0: ; SSE-NEXT: cmplepd %xmm0, %xmm1 -; SSE-NEXT: movmskpd %xmm1, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: setne %al +; SSE-NEXT: movmskpd %xmm1, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb %al +; SSE-NEXT: orb %cl, %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_or_v2f64: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_or_v2f64: @@ -4350,26 +4532,23 @@ ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 
def $zmm0 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: testb $2, %cl -; KNL-NEXT: movl $42, %eax -; KNL-NEXT: movl $99, %edx -; KNL-NEXT: cmovel %edx, %eax -; KNL-NEXT: testb $1, %cl -; KNL-NEXT: cmovel %edx, %eax +; KNL-NEXT: knotw %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al +; KNL-NEXT: movl $42, %ecx +; KNL-NEXT: movl $99, %eax +; KNL-NEXT: cmovel %ecx, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: PR39665_c_ray: ; SKX: # %bb.0: ; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: testb $2, %cl -; SKX-NEXT: movl $42, %eax -; SKX-NEXT: movl $99, %edx -; SKX-NEXT: cmovel %edx, %eax -; SKX-NEXT: testb $1, %cl -; SKX-NEXT: cmovel %edx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: cmpb $3, %al +; SKX-NEXT: movl $42, %ecx +; SKX-NEXT: movl $99, %eax +; SKX-NEXT: cmovel %ecx, %eax ; SKX-NEXT: retq %cmp = fcmp ogt <2 x double> %x, %y %e1 = extractelement <2 x i1> %cmp, i32 0 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -108,12 +108,12 @@ define void @PR13897() nounwind { ; X64-LABEL: PR13897: ; X64: # %bb.0: # %"0x0" -; X64-NEXT: movq bbb(%rip), %rax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shlq $32, %rax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: movq %rax, aaa+8(%rip) -; X64-NEXT: movq %rax, aaa(%rip) +; X64-NEXT: movl bbb(%rip), %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shlq $32, %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rcx, aaa+8(%rip) +; X64-NEXT: movq %rcx, aaa(%rip) ; X64-NEXT: retq ; ; X86-LABEL: PR13897: diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -135,22 +135,22 @@ ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmuludq %xmm4, %xmm0 ; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; @@ -288,11 +288,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %lower0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> %lower1 = shufflevector <4 x i32> %1, <4 x 
i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -52,11 +52,11 @@ ; LINUX-NEXT: movq %rdx, %r12 ; LINUX-NEXT: movq %rsi, %r13 ; LINUX-NEXT: movq %rdi, %rbx -; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; LINUX-NEXT: testb %al, %al ; LINUX-NEXT: je .LBB0_2 ; LINUX-NEXT: # %bb.1: @@ -145,11 +145,11 @@ ; LINUX-X32-NEXT: movq %rdx, %r12 ; LINUX-X32-NEXT: movq %rsi, %r13 ; LINUX-X32-NEXT: movq %rdi, %rbx -; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %r9, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: testb %al, %al ; LINUX-X32-NEXT: je .LBB0_2 ; LINUX-X32-NEXT: # %bb.1: @@ -218,9 +218,9 @@ ; WINDOWS-NEXT: movq %r8, %rdi ; WINDOWS-NEXT: movq %rdx, %rbx ; WINDOWS-NEXT: movq %rcx, %r14 -; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: callq get_f diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll --- a/llvm/test/CodeGen/X86/neg-abs.ll +++ b/llvm/test/CodeGen/X86/neg-abs.ll @@ -36,7 +36,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -180,7 +180,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll --- a/llvm/test/CodeGen/X86/neg_fp.ll +++ b/llvm/test/CodeGen/X86/neg_fp.ll @@ -64,12 +64,14 @@ ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss {{[0-9]+}}(%esp), %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm2 -; CHECK-NEXT: mulss %xmm0, %xmm2 -; CHECK-NEXT: subss %xmm1, %xmm0 -; CHECK-NEXT: divss %xmm2, %xmm0 -; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: mulss %xmm0, %xmm3 +; CHECK-NEXT: subss %xmm2, %xmm1 +; CHECK-NEXT: addss %xmm0, %xmm1 +; CHECK-NEXT: divss %xmm3, %xmm1 +; CHECK-NEXT: movss %xmm1, (%esp) ; CHECK-NEXT: flds (%esp) ; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/no-wide-load.ll b/llvm/test/CodeGen/X86/no-wide-load.ll --- 
a/llvm/test/CodeGen/X86/no-wide-load.ll +++ b/llvm/test/CodeGen/X86/no-wide-load.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: andl $-1121, %eax # imm = 0xFB9F +; CHECK-NEXT: andl $64415, %eax # imm = 0xFB9F ; CHECK-NEXT: orl $1024, %eax # imm = 0x400 ; CHECK-NEXT: movw %ax, 4(%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -828,12 +828,12 @@ ; ; AVX1-LABEL: test_arg_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i32: @@ -908,12 +908,12 @@ ; ; AVX1-LABEL: test_arg_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v4i64: @@ -949,12 +949,12 @@ ; ; AVX1-LABEL: test_arg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i16: @@ -990,12 +990,12 @@ ; ; AVX1-LABEL: test_arg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i8: @@ -1090,18 +1090,18 @@ ; ; AVX1-LABEL: test_arg_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; 
AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i32: @@ -1196,18 +1196,18 @@ ; ; AVX1-LABEL: test_arg_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i64: @@ -1251,18 +1251,18 @@ ; ; AVX1-LABEL: test_arg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i16: @@ -1326,18 +1326,18 @@ ; ; AVX1-LABEL: test_arg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, 
%ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v64i8: diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -236,12 +236,11 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,2,3,2,4,5,6,7] ; SSE2-NEXT: movw %ax, 12(%rdi) -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movq %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v7i16: @@ -473,7 +472,7 @@ ; SSE2-LABEL: v12i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] ; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] @@ -481,7 +480,7 @@ ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: movaps %xmm2, 16(%rdi) @@ -799,32 +798,32 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] -; SSE2-NEXT: packuswb %xmm5, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = 
[255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] -; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5] +; SSE2-NEXT: packuswb %xmm4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6] +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] @@ -833,7 +832,7 @@ ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqu %xmm4, (%rdi) +; SSE2-NEXT: movdqu %xmm3, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i8_in: @@ -858,42 +857,42 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, (%rdi) +; AVX1-NEXT: vmovdqu %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleave_24i8_in: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, 16(%rdi) -; AVX2-NEXT: vmovdqu %xmm2, (%rdi) +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,29],zero,ymm0[22,30],zero,ymm0[23,31],zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastq (%rcx), %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rdi) +; AVX2-NEXT: vmovdqu %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOP-LABEL: interleave_24i8_in: ; XOP: # %bb.0: ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,8],xmm0[0],xmm1[1,9],xmm0[1],xmm1[2,10],xmm0[2],xmm1[3,11],xmm0[3],xmm1[4,12],xmm0[4],xmm1[5] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[13],xmm0[5],xmm1[6,14],xmm0[6],xmm1[7,15],xmm0[7],xmm1[u,u,u,u,u,u,u,u] +; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,8],xmm1[0],xmm0[1,9],xmm1[1],xmm0[2,10],xmm1[2],xmm0[3,11],xmm1[3],xmm0[4,12],xmm1[4],xmm0[5] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[13],xmm1[5],xmm0[6,14],xmm1[6],xmm0[7,15],xmm1[7],xmm0[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm0, 16(%rdi) ; XOP-NEXT: vmovdqu %xmm2, (%rdi) ; XOP-NEXT: retq @@ -1037,12 +1036,12 @@ ; XOP-NEXT: vmovdqu (%rdi), %xmm0 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11] -; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13] -; XOP-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13],xmm2[2,3,8,9,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[0,1,6,7,12,13],xmm1[2,3,8,9,14,15],xmm0[u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,7,8,9,10,11],xmm2[4,5,10,11] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm0[2,3,8,9,14,15],xmm1[4,5,10,11,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,7,8,9],xmm2[0,1,6,7,12,13] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11],xmm1[0,1,6,7,12,13,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9],xmm2[2,3,8,9,14,15] ; XOP-NEXT: vmovdqu %xmm3, (%rsi) ; XOP-NEXT: vmovdqu %xmm4, (%rdx) ; XOP-NEXT: vmovdqu %xmm0, (%rcx) @@ -1187,12 +1186,12 @@ ; XOP-NEXT: vmovdqu (%rdi), %xmm0 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm3[12,13,6,7,0,1,10,11,4,5] -; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm4[10,11,4,5,14,15,8,9,2,3] -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm2[10,11,4,5],xmm0[14,15,8,9,2,3,12,13,6,7,0,1] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,12,13,6,7,0,1],xmm0[10,11,4,5] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm3[6,7,8,9,10,11,12,13,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u],xmm1[10,11,4,5],xmm0[14,15,8,9,2,3] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm4[6,7,8,9,10,11,12,13,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[u,u,u,u,14,15,8,9,2,3],xmm0[12,13,6,7,0,1] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm2[10,11,4,5],xmm0[4,5,6,7,8,9,10,11,12,13,14,15] ; XOP-NEXT: vmovdqu %xmm3, (%rsi) ; XOP-NEXT: vmovdqu %xmm4, (%rdx) ; XOP-NEXT: vmovdqu %xmm0, (%rcx) @@ -1326,14 +1325,14 @@ ; AVX2-FAST-ALL-NEXT: vmovdqu (%rsi), %xmm0 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2 -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] +; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] @@ -1371,17 +1370,18 @@ ; XOP-NEXT: vmovdqu (%rsi), %xmm0 ; XOP-NEXT: vmovdqu (%rdx), %xmm1 ; XOP-NEXT: 
vmovdqu (%rcx), %xmm2 -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11] -; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11] -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm2[10,11],xmm0[10,11,8,9],xmm2[12,13],xmm0[14,15,12,13],xmm2[14,15] -; XOP-NEXT: vmovdqu %xmm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm3, (%rdi) -; XOP-NEXT: vzeroupper +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[10,11,u,u],xmm0[12,13],xmm1[12,13,u,u],xmm0[14,15],xmm1[14,15,u,u] +; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm0[0,1],xmm1[0,1],xmm0[u,u,2,3],xmm1[2,3],xmm0[u,u,4,5],xmm1[4,5] +; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11] +; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; XOP-NEXT: vmovdqu %xmm0, 16(%rdi) +; XOP-NEXT: vmovdqu %xmm4, (%rdi) +; XOP-NEXT: vmovdqu %xmm3, 32(%rdi) ; XOP-NEXT: retq %s1 = load <8 x i16>, ptr %q1, align 4 %s2 = load <8 x i16>, ptr %q2, align 4 @@ -1418,10 +1418,10 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[2,0] ; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm3[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] ; SSE2-NEXT: movups %xmm10, 16(%rsi) @@ -1474,27 +1474,27 @@ ; AVX1-NEXT: vmovups 32(%rdi), %ymm1 ; AVX1-NEXT: vmovups (%rdi), %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX1-NEXT: vmovups 16(%rdi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] -; AVX1-NEXT: vmovups 16(%rdi), %xmm6 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[0,2],ymm3[4,7],ymm5[4,6] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX1-NEXT: vshufps 
{{.*#+}} ymm6 = ymm0[1,0],ymm5[2,0],ymm0[5,4],ymm5[6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0],ymm6[2,0],ymm5[4,4],ymm6[6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm4[0,3],ymm7[5,6],ymm4[4,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-NEXT: vmovups 16(%rdi), %xmm6 ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,3],ymm5[4,5],ymm0[4,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-NEXT: vmovups %ymm3, (%rsi) -; AVX1-NEXT: vmovups %ymm5, (%rdx) +; AVX1-NEXT: vmovups %ymm4, (%rdx) ; AVX1-NEXT: vmovups %ymm0, (%rcx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1596,27 +1596,27 @@ ; XOP-NEXT: vmovups 32(%rdi), %ymm1 ; XOP-NEXT: vmovups (%rdi), %ymm2 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; XOP-NEXT: vmovups 16(%rdi), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] -; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] -; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] -; XOP-NEXT: vmovups 16(%rdi), %xmm6 +; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] +; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[0,2],ymm3[4,7],ymm5[4,6] +; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,0],ymm5[2,0],ymm0[5,4],ymm5[6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0],ymm6[2,0],ymm5[4,4],ymm6[6,4] ; XOP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm4[0,3],ymm7[5,6],ymm4[4,7] +; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; XOP-NEXT: vmovups 16(%rdi), 
%xmm6 ; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] ; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] +; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,3],ymm5[4,5],ymm0[4,7] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; XOP-NEXT: vmovups %ymm3, (%rsi) -; XOP-NEXT: vmovups %ymm5, (%rdx) +; XOP-NEXT: vmovups %ymm4, (%rdx) ; XOP-NEXT: vmovups %ymm0, (%rcx) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq @@ -1633,79 +1633,79 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i32_in: ; SSE2: # %bb.0: -; SSE2-NEXT: movups (%rsi), %xmm1 -; SSE2-NEXT: movups 16(%rsi), %xmm0 -; SSE2-NEXT: movups (%rdx), %xmm3 -; SSE2-NEXT: movups 16(%rdx), %xmm5 -; SSE2-NEXT: movups (%rcx), %xmm4 -; SSE2-NEXT: movups 16(%rcx), %xmm7 -; SSE2-NEXT: movaps %xmm4, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[1,3] -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] -; SSE2-NEXT: movaps %xmm0, %xmm8 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] -; SSE2-NEXT: movaps %xmm7, %xmm9 -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[1,3] -; SSE2-NEXT: movaps %xmm0, %xmm6 +; SSE2-NEXT: movups (%rsi), %xmm0 +; SSE2-NEXT: movups 16(%rsi), %xmm1 +; SSE2-NEXT: movups (%rdx), %xmm5 +; SSE2-NEXT: movups 16(%rdx), %xmm3 +; SSE2-NEXT: movups (%rcx), %xmm6 +; SSE2-NEXT: movups 16(%rcx), %xmm4 +; SSE2-NEXT: movaps %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm0[1,0] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[0,2] +; SSE2-NEXT: movaps %xmm0, %xmm7 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm5[1,1] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[0,2] -; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] +; SSE2-NEXT: movaps %xmm4, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm1[1,0] +; SSE2-NEXT: movaps %xmm1, %xmm7 +; SSE2-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm5 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE2-NEXT: movups %xmm4, 16(%rdi) -; SSE2-NEXT: movups %xmm6, 48(%rdi) -; SSE2-NEXT: movups %xmm7, 64(%rdi) +; SSE2-NEXT: movups %xmm4, 64(%rdi) +; SSE2-NEXT: movups %xmm7, 48(%rdi) +; SSE2-NEXT: movups %xmm6, 16(%rdi) ; SSE2-NEXT: movups %xmm2, (%rdi) -; SSE2-NEXT: movups %xmm1, 32(%rdi) -; SSE2-NEXT: movups %xmm0, 80(%rdi) +; SSE2-NEXT: movups 
%xmm1, 80(%rdi) +; SSE2-NEXT: movups %xmm0, 32(%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_in: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqu (%rsi), %xmm0 -; SSE42-NEXT: movdqu 16(%rsi), %xmm2 +; SSE42-NEXT: movdqu (%rsi), %xmm2 +; SSE42-NEXT: movdqu 16(%rsi), %xmm0 ; SSE42-NEXT: movdqu (%rdx), %xmm3 ; SSE42-NEXT: movdqu 16(%rdx), %xmm4 ; SSE42-NEXT: movdqu (%rcx), %xmm5 ; SSE42-NEXT: movdqu 16(%rcx), %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm8 ; SSE42-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3,4,5],xmm4[6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3],xmm8[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; SSE42-NEXT: movdqu %xmm2, 32(%rdi) -; SSE42-NEXT: movdqu %xmm4, 80(%rdi) -; SSE42-NEXT: movdqu %xmm8, 16(%rdi) +; SSE42-NEXT: movdqu %xmm2, 80(%rdi) +; SSE42-NEXT: movdqu %xmm3, 32(%rdi) +; SSE42-NEXT: movdqu %xmm8, 64(%rdi) ; SSE42-NEXT: movdqu %xmm9, 48(%rdi) -; SSE42-NEXT: movdqu %xmm7, 64(%rdi) +; SSE42-NEXT: movdqu %xmm7, 16(%rdi) ; SSE42-NEXT: movdqu %xmm1, (%rdi) ; SSE42-NEXT: retq ; @@ -1716,28 +1716,28 @@ ; AVX1-NEXT: vmovups 16(%rdx), %xmm2 ; AVX1-NEXT: vmovups (%rsi), %xmm3 ; AVX1-NEXT: vmovups 16(%rsi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm1[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1],xmm5[0,2] ; AVX1-NEXT: vmovlhps 
{{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vbroadcastsd (%rcx), %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm2[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: vmovups %ymm2, 64(%rdi) +; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1746,26 +1746,26 @@ ; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm3, 64(%rdi) +; AVX2-SLOW-NEXT: vmovups 
%ymm4, 64(%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1774,28 +1774,29 @@ ; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6] -; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4 -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %xmm3 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-ALL-NEXT: vmovups %ymm1, 64(%rdi) ; AVX2-FAST-ALL-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdi) -; AVX2-FAST-ALL-NEXT: vmovups %ymm3, 64(%rdi) +; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rdi) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; @@ -1804,26 +1805,26 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, (%rdi) -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, 64(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, 64(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rdi) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1837,25 +1838,25 @@ ; XOP-NEXT: vmovups 16(%rdx), %xmm3 ; XOP-NEXT: vmovups (%rsi), %xmm4 ; XOP-NEXT: vmovups 16(%rsi), %xmm5 -; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm3[3,3] -; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2] -; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,2,3] -; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,3] -; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] -; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm2[1] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] +; XOP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm2[1] +; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm2[1,1],xmm6[0,2] ; XOP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; XOP-NEXT: vbroadcastsd (%rcx), %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3],xmm3[3,3] +; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2] +; XOP-NEXT: vinsertf128 $1, %xmm4, 
%ymm3, %ymm3 +; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] +; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] +; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; XOP-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vmovups %ymm3, 64(%rdi) +; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i32>, ptr %q1, align 4 @@ -2229,12 +2230,14 @@ ; ; SSE42-LABEL: splat_v3i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movq (%rdi), %rax +; SSE42-NEXT: movq %rax, %xmm0 +; SSE42-NEXT: shrq $32, %rax ; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,0,1] +; SSE42-NEXT: movd %eax, %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; SSE42-NEXT: xorps %xmm3, %xmm3 ; SSE42-NEXT: retq ; @@ -2390,8 +2393,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, 128(%rsi) ; SSE-NEXT: movdqa %xmm2, 144(%rsi) +; SSE-NEXT: movdqa %xmm0, 128(%rsi) ; SSE-NEXT: movdqa %xmm0, 16(%rsi) ; SSE-NEXT: movdqa %xmm7, 240(%rsi) ; SSE-NEXT: movdqa %xmm6, 208(%rsi) @@ -2403,19 +2406,19 @@ ; ; AVX1-LABEL: D107009: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovups 96(%rdi), %ymm0 -; AVX1-NEXT: vmovups (%rdi), %ymm1 +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 64(%rdi), %ymm1 ; AVX1-NEXT: vmovups 128(%rdi), %ymm2 -; AVX1-NEXT: vmovups 224(%rdi), %ymm3 +; AVX1-NEXT: vmovups 192(%rdi), %ymm3 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,2],ymm2[4,5],ymm3[4,6] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 @@ -2424,16 +2427,16 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2] +; AVX1-NEXT: vshufps 
{{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) ; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi) ; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi) ; AVX1-NEXT: vmovups %ymm1, 128(%rsi) -; AVX1-NEXT: vmovupd %ymm5, 192(%rsi) -; AVX1-NEXT: vmovups %ymm4, 224(%rsi) +; AVX1-NEXT: vmovups %ymm5, 224(%rsi) +; AVX1-NEXT: vmovupd %ymm4, 192(%rsi) ; AVX1-NEXT: vmovups %ymm3, 160(%rsi) ; AVX1-NEXT: vmovups %ymm2, 64(%rsi) ; AVX1-NEXT: vzeroupper @@ -2455,39 +2458,39 @@ ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vmovdqu %ymm0, 128(%rsi) -; AVX2-NEXT: vmovdqu %ymm7, 192(%rsi) -; AVX2-NEXT: vmovdqu %ymm6, 224(%rsi) +; AVX2-NEXT: vmovdqu %ymm7, 224(%rsi) +; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi) ; AVX2-NEXT: vmovdqu %ymm5, 160(%rsi) -; AVX2-NEXT: vmovdqu %ymm4, 64(%rsi) -; AVX2-NEXT: vmovdqa %xmm3, 112(%rsi) -; AVX2-NEXT: vmovdqu %ymm2, (%rsi) -; AVX2-NEXT: vmovdqa %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqa %xmm4, 112(%rsi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi) +; AVX2-NEXT: vmovdqa %xmm2, 48(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOP-LABEL: D107009: ; XOP: # %bb.0: -; XOP-NEXT: vmovups 96(%rdi), %ymm0 -; XOP-NEXT: vmovups (%rdi), %ymm1 +; XOP-NEXT: vmovups (%rdi), %ymm0 +; XOP-NEXT: vmovups 64(%rdi), %ymm1 ; XOP-NEXT: vmovups 128(%rdi), %ymm2 -; XOP-NEXT: vmovups 224(%rdi), %ymm3 +; XOP-NEXT: vmovups 192(%rdi), %ymm3 ; XOP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] ; XOP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4] -; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 -; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,2],ymm2[4,5],ymm3[4,6] +; XOP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; XOP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0 ; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1 ; XOP-NEXT: vpsrld $16, %xmm1, %xmm1 @@ -2496,16 +2499,16 @@ ; 
XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] +; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2] +; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; XOP-NEXT: vmovdqa %xmm0, 16(%rsi) ; XOP-NEXT: vmovdqa %xmm7, 112(%rsi) ; XOP-NEXT: vmovdqa %xmm6, 48(%rsi) ; XOP-NEXT: vmovups %ymm1, 128(%rsi) -; XOP-NEXT: vmovupd %ymm5, 192(%rsi) -; XOP-NEXT: vmovups %ymm4, 224(%rsi) +; XOP-NEXT: vmovups %ymm5, 224(%rsi) +; XOP-NEXT: vmovupd %ymm4, 192(%rsi) ; XOP-NEXT: vmovups %ymm3, 160(%rsi) ; XOP-NEXT: vmovups %ymm2, 64(%rsi) ; XOP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -77,28 +77,51 @@ ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: PR40815: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX-NEXT: vmovaps %xmm3, (%rsi) -; AVX-NEXT: vmovaps %xmm0, 48(%rsi) -; AVX-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: PR40815: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,0,1] +; AVX1-NEXT: vmovups %ymm2, (%rsi) +; AVX1-NEXT: vextractf128 $1, %ymm1, 32(%rsi) +; AVX1-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR40815: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[2,3,0,1] +; AVX2-NEXT: vmovups %ymm2, (%rsi) +; AVX2-NEXT: vextractf128 $1, %ymm1, 32(%rsi) +; AVX2-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: PR40815: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX512-NEXT: vmovups 16(%rdi), %ymm1 -; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = mem[2,3,0,1] +; AVX512-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX512-NEXT: vmovups %ymm2, (%rsi) +; AVX512-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX512-NEXT: vmovaps %xmm0, 48(%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq +; +; XOP-LABEL: PR40815: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps (%rdi), %xmm0 +; XOP-NEXT: vmovaps (%rdi), %ymm1 +; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,0,1] +; XOP-NEXT: vmovups %ymm2, (%rsi) +; XOP-NEXT: vextractf128 $1, %ymm1, 32(%rsi) +; XOP-NEXT: vmovaps %xmm0, 48(%rsi) +; XOP-NEXT: vzeroupper +; XOP-NEXT: retq %3 = load <16 x float>, ptr %0, align 64 %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> %5 = getelementptr inbounds %struct.Mat4, ptr %1, i64 0, i32 0, i32 0, i64 4 @@ -156,106 +179,125 @@ ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax -; SSE2-NEXT: movdqa c+128(%rip), %xmm0 ; SSE2-NEXT: movdqa c+144(%rip), %xmm2 +; 
SSE2-NEXT: movdqa c+128(%rip), %xmm0 ; SSE2-NEXT: addl c+128(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm2, %xmm2 +; SSE2-NEXT: movaps {{.*#+}} xmm3 = +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE2-NEXT: movdqa d+144(%rip), %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm2 -; SSE2-NEXT: movdqa c+176(%rip), %xmm3 -; SSE2-NEXT: movdqa d+160(%rip), %xmm5 -; SSE2-NEXT: movdqa d+176(%rip), %xmm6 +; SSE2-NEXT: movdqa c+176(%rip), %xmm2 +; SSE2-NEXT: movdqa c+160(%rip), %xmm4 +; SSE2-NEXT: movdqa d+176(%rip), %xmm5 +; SSE2-NEXT: movdqa d+160(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psubd %xmm3, %xmm6 +; SSE2-NEXT: psubd %xmm4, %xmm6 ; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm5, d+160(%rip) -; SSE2-NEXT: movdqa %xmm6, d+176(%rip) -; SSE2-NEXT: movdqa %xmm4, d+144(%rip) +; SSE2-NEXT: movdqa %xmm5, d+176(%rip) +; SSE2-NEXT: movdqa %xmm6, d+160(%rip) +; SSE2-NEXT: movdqa %xmm3, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) -; SSE2-NEXT: paddd %xmm3, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm4 ; SSE2-NEXT: paddd %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, c+160(%rip) -; SSE2-NEXT: movdqa %xmm3, c+176(%rip) +; SSE2-NEXT: movdqa %xmm2, c+176(%rip) +; SSE2-NEXT: movdqa %xmm4, c+160(%rip) ; SSE2-NEXT: retq ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: ; SSE42-NEXT: movl b(%rip), %eax -; SSE42-NEXT: movdqa c+128(%rip), %xmm0 ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 +; SSE42-NEXT: movdqa c+128(%rip), %xmm0 ; SSE42-NEXT: addl c+128(%rip), %eax -; SSE42-NEXT: movd %eax, %xmm2 -; SSE42-NEXT: paddd %xmm0, %xmm2 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = +; SSE42-NEXT: pinsrd $0, %eax, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: paddd %xmm2, %xmm3 +; SSE42-NEXT: pslld $23, %xmm2 +; SSE42-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE42-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE42-NEXT: pmulld %xmm0, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 ; SSE42-NEXT: psubd %xmm1, %xmm3 ; SSE42-NEXT: paddd %xmm1, %xmm1 -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: paddd %xmm0, %xmm4 -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, c+144(%rip) -; SSE42-NEXT: movdqa %xmm4, c+128(%rip) -; SSE42-NEXT: movdqa c+160(%rip), %xmm1 -; SSE42-NEXT: movdqa c+176(%rip), %xmm2 -; SSE42-NEXT: movdqa d+160(%rip), %xmm4 -; SSE42-NEXT: movdqa d+176(%rip), %xmm5 +; SSE42-NEXT: movdqa 
%xmm2, c+128(%rip) +; SSE42-NEXT: movdqa c+176(%rip), %xmm1 +; SSE42-NEXT: movdqa c+160(%rip), %xmm2 +; SSE42-NEXT: movdqa d+176(%rip), %xmm4 +; SSE42-NEXT: movdqa d+160(%rip), %xmm5 ; SSE42-NEXT: movdqa d+128(%rip), %xmm6 ; SSE42-NEXT: pinsrd $0, %eax, %xmm0 ; SSE42-NEXT: psubd %xmm0, %xmm6 ; SSE42-NEXT: psubd %xmm2, %xmm5 ; SSE42-NEXT: psubd %xmm1, %xmm4 -; SSE42-NEXT: movdqa %xmm4, d+160(%rip) -; SSE42-NEXT: movdqa %xmm5, d+176(%rip) +; SSE42-NEXT: movdqa %xmm4, d+176(%rip) +; SSE42-NEXT: movdqa %xmm5, d+160(%rip) ; SSE42-NEXT: movdqa %xmm3, d+144(%rip) ; SSE42-NEXT: movdqa %xmm6, d+128(%rip) ; SSE42-NEXT: paddd %xmm2, %xmm2 ; SSE42-NEXT: paddd %xmm1, %xmm1 -; SSE42-NEXT: movdqa %xmm1, c+160(%rip) -; SSE42-NEXT: movdqa %xmm2, c+176(%rip) +; SSE42-NEXT: movdqa %xmm1, c+176(%rip) +; SSE42-NEXT: movdqa %xmm2, c+160(%rip) ; SSE42-NEXT: retq ; ; AVX1-LABEL: PR42833: ; AVX1: # %bb.0: ; AVX1-NEXT: movl b(%rip), %eax ; AVX1-NEXT: addl c+128(%rip), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa c+128(%rip), %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 +; AVX1-NEXT: vpslld $23, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] ; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2 ; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vmovups %ymm0, c+128(%rip) ; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1 -; AVX1-NEXT: vmovdqa c+176(%rip), %xmm3 +; AVX1-NEXT: vmovdqa d+160(%rip), %xmm1 +; AVX1-NEXT: vmovdqa c+160(%rip), %xmm3 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa d+160(%rip), %xmm4 -; AVX1-NEXT: vmovdqa c+160(%rip), %xmm5 +; AVX1-NEXT: vmovdqa d+176(%rip), %xmm4 +; AVX1-NEXT: vmovdqa c+176(%rip), %xmm5 ; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa %xmm4, d+176(%rip) +; AVX1-NEXT: vmovdqa %xmm1, d+160(%rip) ; AVX1-NEXT: vmovdqa %xmm2, d+144(%rip) -; AVX1-NEXT: vmovdqa %xmm4, d+160(%rip) -; AVX1-NEXT: vmovdqa %xmm1, d+176(%rip) ; AVX1-NEXT: vmovdqa %xmm0, d+128(%rip) ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0 ; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, c+160(%rip) -; AVX1-NEXT: vmovdqa %xmm0, c+176(%rip) +; AVX1-NEXT: vmovdqa %xmm1, c+176(%rip) +; AVX1-NEXT: vmovdqa %xmm0, c+160(%rip) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -287,12 +329,12 @@ ; AVX512-NEXT: movl b(%rip), %eax ; AVX512-NEXT: vmovdqu c+128(%rip), %ymm0 ; AVX512-NEXT: vmovdqu64 c+128(%rip), %zmm1 +; AVX512-NEXT: vmovdqa c+128(%rip), %xmm2 ; AVX512-NEXT: addl c+128(%rip), %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vpaddd %ymm3, %ymm0, %ymm3 ; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa c+128(%rip), %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7] ; 
AVX512-NEXT: vmovdqu %ymm0, c+128(%rip) ; AVX512-NEXT: vmovdqu c+160(%rip), %ymm0 ; AVX512-NEXT: vmovdqu64 d+128(%rip), %zmm3 @@ -310,34 +352,35 @@ ; XOP: # %bb.0: ; XOP-NEXT: movl b(%rip), %eax ; XOP-NEXT: addl c+128(%rip), %eax -; XOP-NEXT: vmovd %eax, %xmm0 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa c+128(%rip), %xmm1 -; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm2 ; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 +; XOP-NEXT: vpshld %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] +; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] ; XOP-NEXT: vmovdqa d+144(%rip), %xmm2 ; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 ; XOP-NEXT: vmovups %ymm0, c+128(%rip) ; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 ; XOP-NEXT: vmovdqa d+128(%rip), %xmm1 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vmovdqa d+176(%rip), %xmm1 -; XOP-NEXT: vmovdqa c+176(%rip), %xmm3 +; XOP-NEXT: vmovdqa d+160(%rip), %xmm1 +; XOP-NEXT: vmovdqa c+160(%rip), %xmm3 ; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa d+160(%rip), %xmm4 -; XOP-NEXT: vmovdqa c+160(%rip), %xmm5 +; XOP-NEXT: vmovdqa d+176(%rip), %xmm4 +; XOP-NEXT: vmovdqa c+176(%rip), %xmm5 ; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4 +; XOP-NEXT: vmovdqa %xmm4, d+176(%rip) +; XOP-NEXT: vmovdqa %xmm1, d+160(%rip) ; XOP-NEXT: vmovdqa %xmm2, d+144(%rip) -; XOP-NEXT: vmovdqa %xmm4, d+160(%rip) -; XOP-NEXT: vmovdqa %xmm1, d+176(%rip) ; XOP-NEXT: vmovdqa %xmm0, d+128(%rip) ; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0 ; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; XOP-NEXT: vmovdqa %xmm1, c+160(%rip) -; XOP-NEXT: vmovdqa %xmm0, c+176(%rip) +; XOP-NEXT: vmovdqa %xmm1, c+176(%rip) +; XOP-NEXT: vmovdqa %xmm0, c+160(%rip) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %1 = load i32, ptr @b, align 4 diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -16,8 +16,14 @@ define i1 @p0_scalar_urem_by_const(i32 %x, i32 %y) { ; CHECK-LABEL: p0_scalar_urem_by_const: ; CHECK: # %bb.0: -; CHECK-NEXT: testb %dil, %dil -; CHECK-NEXT: setns %al +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: andl $128, %edi +; CHECK-NEXT: imulq $715827883, %rdi, %rax # imm = 0x2AAAAAAB +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: addl %eax, %eax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 128 ; clearly a power-of-two or zero %t1 = urem i32 %t0, 6 ; '6' is clearly not a power of two @@ -42,12 +48,16 @@ ; CHECK-LABEL: p2_scalar_shifted_urem_by_const: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; CHECK-NEXT: cmpl $1431655766, %eax # imm = 0x55555556 -; CHECK-NEXT: setb %al +; CHECK-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB +; CHECK-NEXT: imulq %rdi, %rax +; 
CHECK-NEXT: shrq $33, %rax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 1 ; clearly a power-of-two or zero %t1 = shl i32 %t0, %y ; will still be a power-of-two or zero with any %y @@ -60,12 +70,16 @@ ; CHECK-LABEL: p3_scalar_shifted2_urem_by_const: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $2, %edi ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; CHECK-NEXT: cmpl $1431655766, %eax # imm = 0x55555556 -; CHECK-NEXT: setb %al +; CHECK-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: shrq $33, %rax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 2 ; clearly a power-of-two or zero %t1 = shl i32 %t0, %y ; will still be a power-of-two or zero with any %y @@ -83,29 +97,33 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p4_vector_urem_by_const__splat: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: psrld $1, %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; SSE4-NEXT: pmuludq %xmm2, %xmm1 +; SSE4-NEXT: pmuludq %xmm0, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE4-NEXT: psrld $2, %xmm2 +; SSE4-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: psubd %xmm2, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; @@ -113,11 +131,16 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, ; clearly a power-of-two or zero @@ -130,43 +153,55 @@ ; SSE2-LABEL: p5_vector_urem_by_const__nonsplat: ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p5_vector_urem_by_const__nonsplat: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = <1,u,2147483648,u> +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] ; SSE4-NEXT: pmuludq %xmm0, %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE4-NEXT: psrlq $32, %xmm1 -; SSE4-NEXT: por %xmm1, %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,858993459,715827882,477218588] -; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE4-NEXT: movdqa %xmm2, %xmm1 +; SSE4-NEXT: psrld $2, %xmm1 +; SSE4-NEXT: psrld $1, %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; SSE4-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: psubd %xmm2, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX2-LABEL: 
p5_vector_urem_by_const__nonsplat: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, @@ -180,32 +215,39 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrld $1, %xmm1 -; SSE4-NEXT: pslld $31, %xmm0 -; SSE4-NEXT: por %xmm1, %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; SSE4-NEXT: pmuludq %xmm2, %xmm1 +; SSE4-NEXT: pmuludq %xmm0, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE4-NEXT: psrld $2, %xmm2 +; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: psubd %xmm2, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; 
SSE4-NEXT: retq ; @@ -213,11 +255,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, @@ -354,10 +402,13 @@ define i1 @n0_urem_of_maybe_not_power_of_two(i32 %x, i32 %y) { ; CHECK-LABEL: n0_urem_of_maybe_not_power_of_two: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $3, %edi -; CHECK-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; CHECK-NEXT: cmpl $1431655766, %eax # imm = 0x55555556 -; CHECK-NEXT: setb %al +; CHECK-NEXT: imulq $1431655766, %rdi, %rax # imm = 0x55555556 +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 3 ; up to two bits set, not power-of-two %t1 = urem i32 %t0, 3 diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll --- a/llvm/test/CodeGen/X86/overflow.ll +++ b/llvm/test/CodeGen/X86/overflow.ll @@ -56,9 +56,11 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: andl $1, %ecx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: setb %sil ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %1 = zext i64 %a to i128 %2 = zext i64 %b to i128 diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll --- a/llvm/test/CodeGen/X86/packus.ll +++ b/llvm/test/CodeGen/X86/packus.ll @@ -120,17 +120,18 @@ define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) { ; X86-SSE2-LABEL: trunc_lshr_v4i64_demandedelts: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,0,0,0] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; ; X64-SSE2-LABEL: trunc_lshr_v4i64_demandedelts: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,18446744073709551615] +; X64-SSE2-NEXT: movl $1, %eax +; X64-SSE2-NEXT: movq %rax, %xmm2 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] @@ -140,17 +141,18 @@ ; ; X86-SSE4-LABEL: trunc_lshr_v4i64_demandedelts: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; X86-SSE4-NEXT: 
movdqa {{.*#+}} xmm2 = [1,0,0,0] +; X86-SSE4-NEXT: pand %xmm2, %xmm0 ; X86-SSE4-NEXT: pand %xmm2, %xmm1 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE4-NEXT: pand %xmm2, %xmm0 ; X86-SSE4-NEXT: packusdw %xmm1, %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: trunc_lshr_v4i64_demandedelts: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,18446744073709551615] +; X64-SSE4-NEXT: movl $1, %eax +; X64-SSE4-NEXT: movq %rax, %xmm2 ; X64-SSE4-NEXT: pand %xmm2, %xmm0 ; X64-SSE4-NEXT: pand %xmm2, %xmm1 ; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll --- a/llvm/test/CodeGen/X86/parity-vec.ll +++ b/llvm/test/CodeGen/X86/parity-vec.ll @@ -36,8 +36,8 @@ ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: andl $1, %eax -; POPCNT-NEXT: # kill: def $al killed $al killed $eax +; POPCNT-NEXT: testb $1, %al +; POPCNT-NEXT: setne %al ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 %i2 = call i16 @llvm.ctpop.i16(i16 %i1) @@ -50,23 +50,8 @@ ; NOPOPCNT: # %bb.0: ; NOPOPCNT-NEXT: psllw $7, %xmm0 ; NOPOPCNT-NEXT: pmovmskb %xmm0, %eax -; NOPOPCNT-NEXT: movl %eax, %ecx -; NOPOPCNT-NEXT: shrl %ecx -; NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; NOPOPCNT-NEXT: subl %ecx, %eax -; NOPOPCNT-NEXT: movl %eax, %ecx -; NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; NOPOPCNT-NEXT: shrl $2, %eax -; NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; NOPOPCNT-NEXT: addl %ecx, %eax -; NOPOPCNT-NEXT: movl %eax, %ecx -; NOPOPCNT-NEXT: shrl $4, %ecx -; NOPOPCNT-NEXT: addl %eax, %ecx -; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; NOPOPCNT-NEXT: movl %ecx, %eax -; NOPOPCNT-NEXT: shrl $8, %eax -; NOPOPCNT-NEXT: addl %ecx, %eax -; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax +; NOPOPCNT-NEXT: xorb %ah, %al +; NOPOPCNT-NEXT: setnp %al ; NOPOPCNT-NEXT: retq ; ; POPCNT-LABEL: canonical_parity_noncanonical_pred: @@ -74,7 +59,9 @@ ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: # kill: def $al killed $al killed $eax +; POPCNT-NEXT: andl $1, %eax +; POPCNT-NEXT: cmpw $1, %ax +; POPCNT-NEXT: sete %al ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 %i2 = call i16 @llvm.ctpop.i16(i16 %i1) @@ -143,8 +130,8 @@ ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax ; POPCNT-NEXT: andl $1, %eax -; POPCNT-NEXT: xorb $1, %al -; POPCNT-NEXT: # kill: def $al killed $al killed $eax +; POPCNT-NEXT: cmpw $1, %ax +; POPCNT-NEXT: setne %al ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 %i2 = call i16 @llvm.ctpop.i16(i16 %i1) diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -1687,23 +1687,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_add_v8i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_add_v8i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; 
AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: partial_reduction_add_v8i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: partial_reduction_add_v8i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: partial_reduction_add_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: partial_reduction_add_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: partial_reduction_add_v8i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: partial_reduction_add_v8i32: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %x0213 = add <8 x i32> %x, %x23 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> @@ -1730,23 +1771,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_add_v16i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_add_v16i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: partial_reduction_add_v16i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq 
+; AVX1-FAST-LABEL: partial_reduction_add_v16i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: partial_reduction_add_v16i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: partial_reduction_add_v16i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: partial_reduction_add_v16i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: partial_reduction_add_v16i32: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> %x0213 = add <16 x i32> %x, %x23 %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> @@ -1773,24 +1855,64 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_sub_v8i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_sub_v8i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: partial_reduction_sub_v8i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: partial_reduction_sub_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: partial_reduction_sub_v8i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %x0213 = sub <8 x i32> %x, %x23 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> @@ -1817,15 +1939,15 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_sub_v16i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_sub_v16i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX1-FAST: # %bb.0: @@ -1836,19 +1958,40 @@ ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; +; AVX2-SLOW-LABEL: partial_reduction_sub_v16i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; ; AVX2-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovd %xmm0, %eax ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: partial_reduction_sub_v16i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512-FAST-NEXT: vmovd %xmm0, %eax @@ -1937,8 +2080,10 @@ ; ; SSE3-FAST-LABEL: hadd16_8: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 -; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-FAST-NEXT: paddw %xmm0, %xmm1 +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-FAST-NEXT: paddw %xmm1, %xmm0 ; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1958,8 +2103,10 @@ ; ; AVX-FAST-LABEL: hadd16_8: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax ; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1986,9 +2133,10 @@ ; ; SSE3-FAST-LABEL: hadd32_4: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSE3-FAST-NEXT: movd %xmm0, %eax +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 +; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 +; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: hadd32_4: @@ -2002,7 +2150,8 @@ ; ; AVX-FAST-LABEL: hadd32_4: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax ; AVX-FAST-NEXT: retq @@ -2032,23 +2181,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_8: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: hadd32_8: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: hadd32_8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; 
AVX2-SLOW-LABEL: hadd32_8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_8: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_8: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> %x227 = add <8 x i32> %x225, %x226 %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> @@ -2075,23 +2265,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_16: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: hadd32_16: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: hadd32_16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: 
vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_16: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_16: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> %x227 = add <16 x i32> %x225, %x226 %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> @@ -2103,8 +2334,10 @@ define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { ; SSE3-LABEL: hadd16_8_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: phaddw %xmm0, %xmm0 -; SSE3-NEXT: phaddw %xmm0, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: paddw %xmm1, %xmm0 ; SSE3-NEXT: phaddw %xmm0, %xmm0 ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -2112,8 +2345,10 @@ ; ; AVX-LABEL: hadd16_8_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax @@ -2131,14 +2366,16 @@ define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_4_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: phaddd %xmm1, %xmm1 +; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; ; AVX-LABEL: hadd32_4_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -2153,14 +2390,16 @@ define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 { ; SSE3-LABEL: hadd32_4_pgso: ; SSE3: # %bb.0: -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: phaddd %xmm1, %xmm1 +; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; ; AVX-LABEL: hadd32_4_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -2181,13 +2420,63 @@ ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; -; AVX-LABEL: hadd32_8_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: 
hadd32_8_optsize: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: hadd32_8_optsize: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_8_optsize: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_8_optsize: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_8_optsize: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_8_optsize: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> %x227 = add <8 x i32> %x225, %x226 %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> @@ -2205,13 +2494,63 @@ ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; -; AVX-LABEL: hadd32_16_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_16_optsize: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: hadd32_16_optsize: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_16_optsize: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_16_optsize: +; AVX2-FAST: # %bb.0: +; 
AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_16_optsize: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_16_optsize: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> %x227 = add <16 x i32> %x225, %x226 %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll --- a/llvm/test/CodeGen/X86/phaddsub.ll +++ b/llvm/test/CodeGen/X86/phaddsub.ll @@ -340,38 +340,17 @@ } define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { -; SSSE3-SLOW-LABEL: phaddd_single_source2: -; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: retq -; -; SSSE3-FAST-LABEL: phaddd_single_source2: -; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] -; SSSE3-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: phaddd_single_source2: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: phaddd_single_source2: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] -; AVX-FAST-NEXT: retq +; SSSE3-LABEL: phaddd_single_source2: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSSE3-NEXT: retq ; -; AVX2-SHUF-LABEL: phaddd_single_source2: -; AVX2-SHUF: # %bb.0: -; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SHUF-NEXT: retq +; AVX-LABEL: phaddd_single_source2: +; AVX: # %bb.0: +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -503,38 +482,17 @@ } define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { -; SSSE3-SLOW-LABEL: phaddw_single_source2: -; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: retq -; -; SSSE3-FAST-LABEL: phaddw_single_source2: -; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-FAST-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] -; SSSE3-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: phaddw_single_source2: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: phaddw_single_source2: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] -; AVX-FAST-NEXT: retq +; SSSE3-LABEL: phaddw_single_source2: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSSE3-NEXT: retq ; -; AVX2-SHUF-LABEL: phaddw_single_source2: -; AVX2-SHUF: # %bb.0: -; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-SHUF-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-SHUF-NEXT: retq +; AVX-LABEL: phaddw_single_source2: +; AVX: # %bb.0: +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; AVX-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -644,9 +602,11 @@ ; AVX1-FAST-LABEL: PR39936_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -10,15 +10,67 @@ define <8 x i16> @pmaddubsw_128(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_128: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pmaddwd %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_128: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_128: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -43,26 +95,189 @@ define <16 x i16> @pmaddubsw_256(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_256: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 -; SSE-NEXT: pmaddubsw 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psllw $8, %xmm1 +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pmaddwd %xmm4, %xmm1 +; SSE-NEXT: packssdw %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pmaddwd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_256: ; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm3, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = 
xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-NEXT: vpmovsxbw %xmm12, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-NEXT: vpmovsxbw %xmm11, %xmm5 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-NEXT: vpmovsxbw %xmm10, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddubsw_256: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa (%rsi), %ymm0 -; AVX256-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddubsw_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX2-NEXT: 
vpmovsxbd %xmm1, %ymm1 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: pmaddubsw_256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm4 +; AVX512F-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-NEXT: vpand %xmm2, %xmm5, %xmm6 +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: vpmulld %zmm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: pmaddubsw_256: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm3 +; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: 
vpmaddwd %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: retq %A = load <32 x i8>, ptr %Aptr %B = load <32 x i8>, ptr %Bptr %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> @@ -91,63 +306,548 @@ ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 -; SSE-NEXT: pmaddubsw (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw 16(%rsi), %xmm1 -; SSE-NEXT: pmaddubsw 32(%rsi), %xmm2 -; SSE-NEXT: pmaddubsw 48(%rsi), %xmm3 -; SSE-NEXT: movdqa 64(%rdx), %xmm4 -; SSE-NEXT: pmaddubsw 64(%rsi), %xmm4 -; SSE-NEXT: movdqa 80(%rdx), %xmm5 -; SSE-NEXT: pmaddubsw 80(%rsi), %xmm5 -; SSE-NEXT: movdqa 96(%rdx), %xmm6 -; SSE-NEXT: pmaddubsw 96(%rsi), %xmm6 -; SSE-NEXT: movdqa 112(%rdx), %xmm7 -; SSE-NEXT: pmaddubsw 112(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, 112(%rdi) +; SSE-NEXT: movdqa 48(%rdx), %xmm4 +; SSE-NEXT: movdqa 64(%rdx), %xmm7 +; SSE-NEXT: movdqa 80(%rdx), %xmm9 +; SSE-NEXT: movdqa 96(%rdx), %xmm10 +; SSE-NEXT: movdqa 112(%rdx), %xmm8 +; SSE-NEXT: movdqa 112(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: psllw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm11, %xmm12 +; SSE-NEXT: movdqa 96(%rsi), %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: psllw $8, %xmm6 +; SSE-NEXT: psraw $8, %xmm6 +; SSE-NEXT: psraw $8, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm8, %xmm3 +; SSE-NEXT: packssdw %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE-NEXT: pmaddwd %xmm8, %xmm12 +; SSE-NEXT: movdqa 80(%rsi), %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: psllw $8, %xmm8 +; SSE-NEXT: psraw $8, %xmm8 +; SSE-NEXT: psraw $8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm10, %xmm6 +; SSE-NEXT: packssdw %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pmaddwd %xmm10, %xmm11 +; SSE-NEXT: movdqa 64(%rsi), %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: psllw $8, %xmm10 +; SSE-NEXT: psraw $8, %xmm10 +; SSE-NEXT: psraw $8, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm9, %xmm8 +; SSE-NEXT: packssdw %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: pmaddwd %xmm9, %xmm11 +; SSE-NEXT: movdqa 48(%rsi), %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: psllw $8, %xmm9 +; SSE-NEXT: psraw $8, %xmm9 +; SSE-NEXT: psraw $8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm7, %xmm10 +; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pmaddwd %xmm7, %xmm11 +; SSE-NEXT: movdqa 32(%rsi), %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: psllw $8, %xmm7 +; SSE-NEXT: psraw $8, %xmm7 +; SSE-NEXT: psraw $8, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm9 +; SSE-NEXT: packssdw %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm11 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: psllw $8, %xmm4 +; SSE-NEXT: psraw $8, %xmm4 +; SSE-NEXT: psraw $8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm2, %xmm7 +; SSE-NEXT: packssdw %xmm11, %xmm7 +; SSE-NEXT: movdqa 
%xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pmaddwd %xmm2, %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: psllw $8, %xmm12 +; SSE-NEXT: psraw $8, %xmm12 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm1, %xmm4 +; SSE-NEXT: packssdw %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: pmaddwd %xmm1, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: pmaddwd %xmm0, %xmm12 +; SSE-NEXT: packssdw %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm3, 112(%rdi) ; SSE-NEXT: movdqa %xmm6, 96(%rdi) -; SSE-NEXT: movdqa %xmm5, 80(%rdi) -; SSE-NEXT: movdqa %xmm4, 64(%rdi) -; SSE-NEXT: movdqa %xmm3, 48(%rdi) -; SSE-NEXT: movdqa %xmm2, 32(%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm8, 80(%rdi) +; SSE-NEXT: movdqa %xmm10, 64(%rdi) +; SSE-NEXT: movdqa %xmm9, 48(%rdi) +; SSE-NEXT: movdqa %xmm7, 32(%rdi) +; SSE-NEXT: movdqa %xmm4, 16(%rdi) +; SSE-NEXT: movdqa %xmm12, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm3, %xmm1 -; AVX1-NEXT: vpmaddubsw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovdqa 80(%rsi), %xmm2 -; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 112(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX1-NEXT: vpmaddubsw 96(%rdi), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: subq $104, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 112 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX1-NEXT: 
vmovdqa 96(%rdi), %xmm8 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm10 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm10[0],xmm5[0] +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX1-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm10 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm13 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm10[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX1-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm10 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm10, %xmm14, %xmm14 +; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm10, %xmm15, %xmm15 +; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm10, %xmm12, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpand %xmm10, %xmm11, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm10, %xmm9, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm10, %xmm8, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm6, %xmm10, %xmm0 +; AVX1-NEXT: vpand %xmm7, %xmm10, %xmm7 +; AVX1-NEXT: vpand %xmm2, %xmm10, %xmm6 +; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm11 +; AVX1-NEXT: vpand %xmm10, %xmm13, %xmm12 +; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 112(%rsi), %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm13 +; AVX1-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm10, %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpackuswb %xmm0, %xmm7, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm14, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm7 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] +; AVX1-NEXT: vmovdqa %xmm9, %xmm10 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpackuswb %xmm14, %xmm14, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[3,3,3,3] +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpmaddwd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpackuswb %xmm6, %xmm11, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: 
vpmovzxbw {{.*#+}} xmm2 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm15, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm5 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm6, %xmm6, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-NEXT: vpackuswb %xmm15, %xmm15, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[3,3,3,3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm12, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 
# 16-byte Reload +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX1-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm14, %xmm5, %xmm5 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX1-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] +; AVX1-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vpackuswb %xmm14, %xmm14, %xmm6 +; AVX1-NEXT: vpackuswb %xmm12, %xmm12, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,1,1] +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; 
AVX1-NEXT: vpackuswb %xmm11, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm13, %xmm8, %xmm8 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackssdw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm10, %xmm5 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm6 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm11, %xmm11, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; AVX1-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm13, %xmm13, %xmm10 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX1-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[3,3,3,3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpackssdw %xmm5, 
%xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: addq $104, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddubsw_512: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 -; AVX2-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm11, %ymm0, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm8 +; AVX2-NEXT: vpshufb %ymm11, %ymm4, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,1,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm13 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm4 +; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm10, %ymm7, %ymm6 +; AVX2-NEXT: vpshufb %ymm11, %ymm3, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX2-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm7 +; AVX2-NEXT: vpmovsxbd %xmm7, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 +; AVX2-NEXT: vpmovsxbd %xmm8, %ymm12 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX2-NEXT: vpmovsxbd %xmm10, %ymm13 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm10, %ymm14 +; AVX2-NEXT: vpmovsxbd %xmm5, %ymm15 +; AVX2-NEXT: 
vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm7, %ymm10 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm12, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm8, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm6, %ymm13, %ymm7 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm14, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm15, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero,xmm13[4],zero,zero,zero,xmm13[5],zero,zero,zero,xmm13[6],zero,zero,zero,xmm13[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm14, %ymm12, %ymm12 +; AVX2-NEXT: vpaddd %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm12, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm12 +; AVX2-NEXT: vpackssdw %ymm0, %ymm11, %ymm10 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm12, %ymm0 +; AVX2-NEXT: vpermq $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-NEXT: # ymm11 = mem[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmovsxbd %xmm9, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm11, %ymm3 +; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: pmaddubsw_512: @@ -166,10 +866,73 @@ ; ; AVX512BW-LABEL: pmaddubsw_512: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vpmaddubsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaddubsw 64(%rdi), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[128,128,128,128,128,128,128,128,0,2,4,6,8,10,12,14,128,128,128,128,128,128,128,128,16,18,20,22,24,26,28,30,128,128,128,128,128,128,128,128,32,34,36,38,40,42,44,46,128,128,128,128,128,128,128,128,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vpshufb %zmm4, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,4,6,8,10,12,14,128,128,128,128,128,128,128,128,16,18,20,22,24,26,28,30,128,128,128,128,128,128,128,128,32,34,36,38,40,42,44,46,128,128,128,128,128,128,128,128,48,50,52,54,56,58,60,62,128,128,128,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,9,11,13,15] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,128,128,128,128,1,3,5,7,9,11,13,15,128,128,128,128,128,128,128,128,17,19,21,23,25,27,29,31,128,128,128,128,128,128,128,128,33,35,37,39,41,43,45,47,128,128,128,128,128,128,128,128,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vpshufb %zmm5, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [1,3,5,7,9,11,13,15,128,128,128,128,128,128,128,128,17,19,21,23,25,27,29,31,128,128,128,128,128,128,128,128,33,35,37,39,41,43,45,47,128,128,128,128,128,128,128,128,49,51,53,55,57,59,61,63,128,128,128,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm1 +; AVX512BW-NEXT: vpshufb %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm6, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512BW-NEXT: vpshufb %zmm5, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb %zmm10, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm7, %ymm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX512BW-NEXT: vpmovsxbd %xmm5, %zmm5 +; AVX512BW-NEXT: vpmovsxbd %xmm7, %zmm6 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm7 +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero,xmm8[8],zero,zero,zero,xmm8[9],zero,zero,zero,xmm8[10],zero,zero,zero,xmm8[11],zero,zero,zero,xmm8[12],zero,zero,zero,xmm8[13],zero,zero,zero,xmm8[14],zero,zero,zero,xmm8[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm7, %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-NEXT: vpmovsxbd %xmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbd %xmm6, %zmm6 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512BW-NEXT: vpmovsxbd %xmm8, %zmm8 +; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero,xmm10[4],zero,zero,zero,xmm10[5],zero,zero,zero,xmm10[6],zero,zero,zero,xmm10[7],zero,zero,zero,xmm10[8],zero,zero,zero,xmm10[9],zero,zero,zero,xmm10[10],zero,zero,zero,xmm10[11],zero,zero,zero,xmm10[12],zero,zero,zero,xmm10[13],zero,zero,zero,xmm10[14],zero,zero,zero,xmm10[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm10, %zmm7, %zmm7 +; AVX512BW-NEXT: vpaddd %zmm7, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero,xmm9[8],zero,zero,zero,xmm9[9],zero,zero,zero,xmm9[10],zero,zero,zero,xmm9[11],zero,zero,zero,xmm9[12],zero,zero,zero,xmm9[13],zero,zero,zero,xmm9[14],zero,zero,zero,xmm9[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm7, %zmm6, %zmm6 +; AVX512BW-NEXT: vpaddd %zmm6, %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero,xmm6[8],zero,zero,zero,xmm6[9],zero,zero,zero,xmm6[10],zero,zero,zero,xmm6[11],zero,zero,zero,xmm6[12],zero,zero,zero,xmm6[13],zero,zero,zero,xmm6[14],zero,zero,zero,xmm6[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vpaddd %zmm6, %zmm5, %zmm5 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovsdw %zmm5, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsdw %zmm2, %ymm1 +; AVX512BW-NEXT: vpmovsdw %zmm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: retq %A = load <128 x i8>, ptr %Aptr %B = load <128 x i8>, ptr %Bptr @@ -195,15 +958,78 @@ define <8 x i16> @pmaddubsw_swapped_indices(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_swapped_indices: ; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: 
movdqa %xmm1, %xmm2 +; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[9],zero,xmm4[8],zero,xmm4[10],zero,xmm4[11],zero,xmm4[13],zero,xmm4[12],zero,xmm4[14],zero,xmm4[15],zero +; SSE-NEXT: pmaddwd %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[0],zero,xmm0[2],zero,xmm0[3],zero,xmm0[5],zero,xmm0[4],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pmaddwd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_swapped_indices: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_swapped_indices: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,5,6,9,10,13,14,1,2,5,6,9,10,13,14] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,3,4,7,8,11,12,15,0,3,4,7,8,11,12,15] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_swapped_indices: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,2,5,6,9,10,13,14,1,2,5,6,9,10,13,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,3,4,7,8,11,12,15,0,3,4,7,8,11,12,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: 
vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> ;indices aren't all even @@ -228,15 +1054,67 @@ define <8 x i16> @pmaddubsw_swapped_extend(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_swapped_extend: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pmaddubsw (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pmaddwd %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_swapped_extend: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_swapped_extend: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, 
%xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_swapped_extend: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX256-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -261,15 +1139,77 @@ define <8 x i16> @pmaddubsw_commuted_mul(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_commuted_mul: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,8],zero,xmm3[10],zero,xmm3[12],zero,xmm3[14],zero +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlw $8, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pmaddwd %xmm3, %xmm5 +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[u,u,u,u,u,u,u,u] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pmaddwd %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_commuted_mul: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_commuted_mul: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[9],zero,xmm1[11],zero,xmm1[13],zero,xmm1[15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_commuted_mul: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -470,3 +1410,5 @@ %trunc = trunc <8 x i32> %min to <8 x i16> ret <8 x i16> %trunc } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -6,10 +6,16 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW define <4 x i16> @zext_mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) { -; SSE-LABEL: zext_mulhuw_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pmulhuw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: zext_mulhuw_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pmulhuw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: retq +; +; SSE41-LABEL: zext_mulhuw_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmulhuw %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: zext_mulhuw_v4i16: ; AVX: # %bb.0: @@ -26,37 +32,47 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: and_mulhuw_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 -; SSE2-NEXT: pmulhuw %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm4, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4],xmm4[5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4],xmm4[5],xmm1[6,7] +; SSE41-NEXT: pmuldq %xmm3, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4],xmm4[5],xmm0[6,7] +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packusdw %xmm4, %xmm0 -; SSE41-NEXT: pmulhuw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -65,7 +81,11 @@ ; ; AVX512-LABEL: and_mulhuw_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -78,14 +98,32 @@ } define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) { -; SSE-LABEL: sext_mulhw_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pmulhw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sext_mulhw_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sext_mulhw_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmaddwd %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: sext_mulhw_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> @@ -103,6 +141,7 @@ ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: pmulhw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: ashr_mulhw_v4i16: @@ -217,20 +256,40 @@ } define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: sextinreg_mulhw_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $24, %xmm1 -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: pslld $24, %xmm0 -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pslld $25, %xmm3 -; SSE-NEXT: psrad $25, %xmm3 -; SSE-NEXT: pslld $25, %xmm2 -; SSE-NEXT: psrad $25, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: pmulhw %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sextinreg_mulhw_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $24, %xmm1 +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pslld $25, %xmm3 +; SSE2-NEXT: psrad $25, %xmm3 +; SSE2-NEXT: pslld $25, %xmm2 +; SSE2-NEXT: psrad $25, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sextinreg_mulhw_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pslld $24, %xmm1 +; SSE41-NEXT: psrad $24, %xmm1 +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: psrad $24, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: pslld $25, %xmm3 +; SSE41-NEXT: psrad $25, %xmm3 +; SSE41-NEXT: pslld $25, %xmm2 +; SSE41-NEXT: psrad $25, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pmulhw %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX2-LABEL: sextinreg_mulhw_v8i16: ; AVX2: # %bb.0: @@ -309,22 +368,26 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 -; SSE2-NEXT: pmulhw %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm8 -; SSE2-NEXT: packssdw %xmm5, %xmm8 -; SSE2-NEXT: pmulhw %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pmaddwd %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pmaddwd %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pmaddwd %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pmaddwd %xmm2, %xmm1 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm7, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: @@ -332,30 +395,36 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] ; SSE41-NEXT: pand %xmm8, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm7 +; SSE41-NEXT: pmaddwd %xmm3, %xmm7 ; SSE41-NEXT: pand %xmm8, %xmm6 -; SSE41-NEXT: packusdw %xmm7, %xmm6 -; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: pmaddwd %xmm2, %xmm6 ; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pmaddwd %xmm1, %xmm5 ; SSE41-NEXT: pand %xmm4, %xmm8 -; SSE41-NEXT: packusdw %xmm5, %xmm8 -; SSE41-NEXT: pmulhw %xmm8, %xmm0 +; SSE41-NEXT: pmaddwd %xmm8, %xmm0 +; SSE41-NEXT: psrld $16, %xmm7 +; SSE41-NEXT: psrld $16, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: psrld $16, %xmm5 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, 
%ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq @@ -375,7 +444,8 @@ ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: retq %a1 = and <16 x i32> %a, @@ -678,15 +748,105 @@ } define <8 x i16> @zext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) { -; SSE-LABEL: zext_mulhuw_v8i16_i64: -; SSE: # %bb.0: -; SSE-NEXT: pmulhuw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: zext_mulhuw_v8i16_i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm7 +; SSE2-NEXT: psrlq $16, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] +; SSE2-NEXT: psrlq $16, %xmm2 +; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: zext_mulhuw_v8i16_i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: zext_mulhuw_v8i16_i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm3, %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm6, %xmm1 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: zext_mulhuw_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_mulhuw_v8i16_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = zext <8 x i16> %a to <8 x i64> %b1 = zext <8 x i16> %b to <8 x i64> %c = mul <8 x i64> %a1, %b1 @@ -696,15 +856,76 @@ } define <8 x i16> @sext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) { -; SSE-LABEL: sext_mulhuw_v8i16_i64: -; SSE: # %bb.0: -; SSE-NEXT: pmulhw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sext_mulhuw_v8i16_i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pmulhw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: sext_mulhuw_v8i16_i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: sext_mulhuw_v8i16_i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] 
+; SSE41-NEXT: pmovsxwq %xmm2, %xmm3 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; SSE41-NEXT: pmuldq %xmm3, %xmm2 +; SSE41-NEXT: pmovsxwq %xmm1, %xmm0 +; SSE41-NEXT: pmuldq %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE41-NEXT: pmuldq %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; SSE41-NEXT: pmuldq %xmm6, %xmm1 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: sext_mulhuw_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_mulhuw_v8i16_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = sext <8 x i16> %a to <8 x i64> %b1 = sext <8 x i16> %b to <8 x i64> %c = mul <8 x i64> %a1, %b1 @@ -742,21 +963,27 @@ define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) { ; SSE2-LABEL: mulhsw_v4i16_lshr: ; SSE2: # %bb.0: -; SSE2-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v4i16_lshr: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmaddwd %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mulhsw_v4i16_lshr: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> @@ -768,21 +995,27 @@ define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) { ; SSE2-LABEL: mulhsw_v4i16_ashr: ; SSE2: # %bb.0: -; SSE2-NEXT: pmulhw %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v4i16_ashr: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmaddwd %xmm1, %xmm0 +; SSE41-NEXT: psrad $16, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mulhsw_v4i16_ashr: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> @@ -859,11 +1092,10 @@ ; SSE2-LABEL: mulhsw_v8i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v8i16_ashr: @@ -891,15 +1123,15 @@ ; SSE2-LABEL: zext_mulhuw_v16i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhuw %xmm1, %xmm3 ; SSE2-NEXT: pmulhuw %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: pmulhuw %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: 
punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; @@ -941,15 +1173,15 @@ ; SSE2-LABEL: mulhsw_v16i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm1, %xmm3 ; SSE2-NEXT: pmulhw %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: pmulhw %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; @@ -990,18 +1222,17 @@ define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: mulhsw_v16i16_ashr: ; SSE2: # %bb.0: -; SSE2-NEXT: pmulhw %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pmulhw %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: pmulhw %xmm2, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v16i16_ashr: @@ -1042,31 +1273,31 @@ ; SSE2-LABEL: zext_mulhuw_v32i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhuw %xmm7, %xmm3 +; SSE2-NEXT: pmulhuw %xmm6, %xmm2 +; SSE2-NEXT: pmulhuw %xmm5, %xmm1 ; SSE2-NEXT: pmulhuw %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: pmulhuw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: pmulhuw %xmm6, %xmm2 -; SSE2-NEXT: 
movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: pmulhuw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm7, 96(%rdi) +; SSE2-NEXT: movdqa %xmm8, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm6, 64(%rdi) +; SSE2-NEXT: movdqa %xmm7, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm6, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: zext_mulhuw_v32i16_lshr: @@ -1137,31 +1368,31 @@ ; SSE2-LABEL: mulhsw_v32i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw %xmm7, %xmm3 +; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: pmulhw %xmm5, %xmm1 ; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: pmulhw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: pmulhw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: pmulhw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm7, 96(%rdi) +; SSE2-NEXT: movdqa %xmm8, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; 
SSE2-NEXT: movdqa %xmm6, 64(%rdi) +; SSE2-NEXT: movdqa %xmm7, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm6, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v32i16_lshr: @@ -1232,34 +1463,34 @@ ; SSE2-LABEL: mulhsw_v32i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw %xmm7, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pmulhw %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pmulhw %xmm5, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pmulhw %xmm6, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pmulhw %xmm7, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm7, 96(%rdi) -; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm6, 64(%rdi) -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) -; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, 64(%rdi) +; SSE2-NEXT: movdqa %xmm5, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 32(%rdi) +; SSE2-NEXT: movdqa %xmm4, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v32i16_ashr: @@ -1334,40 +1565,40 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm11, 
%xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm5, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} 
xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) ; SSE2-NEXT: movdqa %xmm7, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) @@ -1379,7 +1610,7 @@ ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) ; SSE2-NEXT: movdqa %xmm12, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm11, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: movdqa %xmm9, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) @@ -1515,40 +1746,40 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm5, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) ; SSE2-NEXT: movdqa %xmm7, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) @@ -1560,7 +1791,7 @@ ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) ; SSE2-NEXT: movdqa %xmm12, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm11, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: movdqa %xmm9, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) @@ -1695,62 +1926,62 @@ ; SSE2-LABEL: mulhsw_v64i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = 
xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm15 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm14 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; SSE2-NEXT: psrad $16, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm13 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; SSE2-NEXT: psrad $16, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE2-NEXT: psrad $16, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm11 ; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm10 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm9 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: movdqa %xmm7, 240(%rdi) -; SSE2-NEXT: movdqa %xmm15, 224(%rdi) -; SSE2-NEXT: movdqa %xmm6, 208(%rdi) -; SSE2-NEXT: movdqa %xmm14, 192(%rdi) -; SSE2-NEXT: movdqa %xmm5, 176(%rdi) -; 
SSE2-NEXT: movdqa %xmm13, 160(%rdi) -; SSE2-NEXT: movdqa %xmm4, 144(%rdi) -; SSE2-NEXT: movdqa %xmm12, 128(%rdi) -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm11, 96(%rdi) -; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm10, 64(%rdi) -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm9, 32(%rdi) -; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: movdqa %xmm8, 240(%rdi) +; SSE2-NEXT: movdqa %xmm7, 224(%rdi) +; SSE2-NEXT: movdqa %xmm9, 208(%rdi) +; SSE2-NEXT: movdqa %xmm6, 192(%rdi) +; SSE2-NEXT: movdqa %xmm10, 176(%rdi) +; SSE2-NEXT: movdqa %xmm5, 160(%rdi) +; SSE2-NEXT: movdqa %xmm11, 144(%rdi) +; SSE2-NEXT: movdqa %xmm4, 128(%rdi) +; SSE2-NEXT: movdqa %xmm12, 112(%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) +; SSE2-NEXT: movdqa %xmm13, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, 64(%rdi) +; SSE2-NEXT: movdqa %xmm14, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 32(%rdi) +; SSE2-NEXT: movdqa %xmm15, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v64i16_ashr: @@ -1876,45 +2107,79 @@ define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pmulhuw %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm3 +; SSE2-NEXT: psrlq $16, %xmm0 +; SSE2-NEXT: psrlq $16, %xmm4 +; SSE2-NEXT: psrlq $16, %xmm2 +; SSE2-NEXT: psrlq $16, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhuw %xmm1, %xmm0 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: 
pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm6, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 ; AVX512-NEXT: retq %a1 = zext <8 x i16> %a to <8 x i64> %b1 = zext <8 x i16> %b to <8 x i64> @@ -1942,29 +2207,51 @@ ; ; SSE41-LABEL: sext_mulhsw_v8i16_lshr_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm1, %xmm0 +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm4 +; SSE41-NEXT: pmuldq %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm3 +; SSE41-NEXT: pmuldq %xmm6, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: sext_mulhsw_v8i16_lshr_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: sext_mulhsw_v8i16_lshr_i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 ; AVX512-NEXT: retq %a1 = sext <8 x i16> %a to <8 x i64> %b1 = sext <8 x i16> %b to <8 x i64> @@ -1996,29 +2283,67 @@ ; ; SSE41-LABEL: sext_mulhsw_v8i16_ashr_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm4, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; 
SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE41-NEXT: pmuldq %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] ; SSE41-NEXT: retq ; ; AVX2-LABEL: sext_mulhsw_v8i16_ashr_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrad $16, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-NEXT: vpsrad $16, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlq $16, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX2-NEXT: retq ; ; AVX512-LABEL: sext_mulhsw_v8i16_ashr_i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsraq $16, %zmm0, %zmm0 ; AVX512-NEXT: retq %a1 = sext <8 x i16> %a to <8 x i64> %b1 = sext <8 x i16> %b to <8 x i64> diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll --- a/llvm/test/CodeGen/X86/pr15267.ll +++ b/llvm/test/CodeGen/X86/pr15267.ll @@ -7,18 +7,14 @@ ; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $3, %ecx -; CHECK-NEXT: andl $7, %ecx -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: andl $7, %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $6, %ecx -; CHECK-NEXT: andl $7, %ecx -; CHECK-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: shrl $9, %eax -; CHECK-NEXT: andl $7, %eax -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %ret = load <4 x i3>, ptr %in, align 1 ret <4 x i3> %ret @@ -50,28 +46,28 @@ ; CHECK-LABEL: test3: ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: negq %rcx +; CHECK-NEXT: vmovq %rcx, %xmm0 ; CHECK-NEXT: movzbl %al, %ecx -; CHECK-NEXT: shrb %al +; CHECK-NEXT: shrb $2, %al ; 
CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $1, %edx -; CHECK-NEXT: negl %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; CHECK-NEXT: negq %rax +; CHECK-NEXT: vmovq %rax, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: shrb $2, %al -; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: negq %rax +; CHECK-NEXT: vmovq %rax, %xmm1 +; CHECK-NEXT: shrb %cl ; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %wide.load35 = load <4 x i1>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/X86/pr22338.ll b/llvm/test/CodeGen/X86/pr22338.ll --- a/llvm/test/CodeGen/X86/pr22338.ll +++ b/llvm/test/CodeGen/X86/pr22338.ll @@ -8,17 +8,20 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; X86-NEXT: sete %cl ; X86-NEXT: setne %al +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; X86-NEXT: sete %dl +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: addl %edx, %edx ; X86-NEXT: negl %eax -; X86-NEXT: addb %cl, %cl ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: addb %dl, %dl ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax ; X86-NEXT: .p2align 4, 0x90 @@ -33,17 +36,20 @@ ; ; X64-LABEL: fn: ; X64: # %bb.0: # %entry +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $1, %edi ; X64-NEXT: sete %cl ; X64-NEXT: setne %al +; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl $1, %esi ; X64-NEXT: sete %dl +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: addl %edx, %edx ; X64-NEXT: negl %eax -; X64-NEXT: addb %cl, %cl ; X64-NEXT: movl %eax, %esi +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shll %cl, %esi -; X64-NEXT: addb %dl, %dl ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: shll %cl, %eax ; X64-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/pr23258.ll b/llvm/test/CodeGen/X86/pr23258.ll --- a/llvm/test/CodeGen/X86/pr23258.ll +++ b/llvm/test/CodeGen/X86/pr23258.ll @@ -22,11 +22,11 @@ ; HAS-RAX-LABEL: bar: ; HAS-RAX: # %bb.0: ; HAS-RAX-NEXT: subq $56, %rsp -; HAS-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; HAS-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; HAS-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; HAS-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: testb %al, %al ; HAS-RAX-NEXT: je .LBB1_2 ; HAS-RAX-NEXT: # %bb.1: @@ -39,28 +39,28 @@ ; HAS-RAX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: .LBB1_2: -; HAS-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; HAS-RAX-NEXT: movq %rax, 8 ; HAS-RAX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; HAS-RAX-NEXT: movq 
%rax, 16 -; HAS-RAX-NEXT: movl $8, 0 +; HAS-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; HAS-RAX-NEXT: movq %rax, 8 ; HAS-RAX-NEXT: movl $48, 4 +; HAS-RAX-NEXT: movl $8, 0 ; HAS-RAX-NEXT: addq $56, %rsp ; HAS-RAX-NEXT: retq ; ; NO-RAX-LABEL: bar: ; NO-RAX: # %bb.0: -; NO-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; NO-RAX-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; NO-RAX-NEXT: movq %rax, 8 +; NO-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; NO-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; NO-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; NO-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; NO-RAX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; NO-RAX-NEXT: movq %rax, 16 -; NO-RAX-NEXT: movl $8, 0 +; NO-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; NO-RAX-NEXT: movq %rax, 8 ; NO-RAX-NEXT: movl $48, 4 +; NO-RAX-NEXT: movl $8, 0 ; NO-RAX-NEXT: retq call void @llvm.va_start(ptr null) ret void diff --git a/llvm/test/CodeGen/X86/pr28472.ll b/llvm/test/CodeGen/X86/pr28472.ll --- a/llvm/test/CodeGen/X86/pr28472.ll +++ b/llvm/test/CodeGen/X86/pr28472.ll @@ -4,6 +4,11 @@ define float @same_dynamic_index_fp_vector_type(float %val, i32 %idx) { ; CHECK-LABEL: same_dynamic_index_fp_vector_type: ; CHECK: # %bb.0: # %bb +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: andl $3, %edi +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq bb: %tmp0 = insertelement <4 x float> undef, float %val, i32 %idx diff --git a/llvm/test/CodeGen/X86/pr31045.ll b/llvm/test/CodeGen/X86/pr31045.ll --- a/llvm/test/CodeGen/X86/pr31045.ll +++ b/llvm/test/CodeGen/X86/pr31045.ll @@ -19,7 +19,7 @@ ; CHECK-LABEL: _Z1av: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl struct_obj_3+8(%rip), %eax -; CHECK-NEXT: movzbl var_46(%rip), %ecx +; CHECK-NEXT: movsbl var_46(%rip), %ecx ; CHECK-NEXT: movzbl var_49(%rip), %edx ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addl %eax, %eax diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll --- a/llvm/test/CodeGen/X86/pr32284.ll +++ b/llvm/test/CodeGen/X86/pr32284.ll @@ -162,17 +162,17 @@ ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setne -{{[0-9]+}}(%rsp) ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpq $-1, %rax -; X64-NEXT: sete %cl -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl $-1, %eax -; X64-NEXT: sete %dl +; X64-NEXT: sete %cl ; X64-NEXT: addq $7093, %rax # imm = 0x1BB5 -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpq %rax, %rdx -; X64-NEXT: setg %sil -; X64-NEXT: movq %rsi, var_57(%rip) -; X64-NEXT: movq %rcx, _ZN8struct_210member_2_0E(%rip) +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rax, %rcx +; X64-NEXT: setg %dl +; X64-NEXT: movq %rdx, var_57(%rip) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $-1, var_5(%rip) +; X64-NEXT: sete %al +; X64-NEXT: movq %rax, _ZN8struct_210member_2_0E(%rip) ; X64-NEXT: retq ; ; X86-O0-LABEL: f1: @@ -213,35 +213,28 @@ ; ; X86-LABEL: f1: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: subl $1, %esp -; X86-NEXT: .cfi_def_cfa_offset 13 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 -; X86-NEXT: movl var_5, %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: .cfi_def_cfa_offset 9 +; X86-NEXT: .cfi_offset %esi, -8 +; 
X86-NEXT: movl var_5, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl $208307499, %edx # imm = 0xC6A852B -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl $-2, %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: setne (%esp) -; X86-NEXT: movl %eax, %esi -; X86-NEXT: andl %ecx, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: addl $7093, %esi # imm = 0x1BB5 +; X86-NEXT: adcl $0, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: cmpl $-1, %ecx ; X86-NEXT: sete %dl -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl $-1, %eax -; X86-NEXT: sete %bl -; X86-NEXT: addl $7093, %eax # imm = 0x1BB5 -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: sbbl $0, %eax ; X86-NEXT: setl %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movl %eax, var_57 @@ -249,10 +242,8 @@ ; X86-NEXT: movl %edx, _ZN8struct_210member_2_0E ; X86-NEXT: movl $0, _ZN8struct_210member_2_0E+4 ; X86-NEXT: addl $1, %esp -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebx +; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: @@ -451,14 +442,16 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: movl var_13(%rip), %eax ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: notl %eax +; X64-NEXT: testq %rax, %rax ; X64-NEXT: sete %cl -; X64-NEXT: movl var_16(%rip), %edx -; X64-NEXT: xorl %eax, %edx -; X64-NEXT: andl %edx, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF +; X64-NEXT: xorq %rax, %rdx +; X64-NEXT: movl var_16(%rip), %esi +; X64-NEXT: xorl %edx, %esi +; X64-NEXT: andl %esi, %ecx +; X64-NEXT: orq %rdx, %rcx ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NEXT: notl %eax ; X64-NEXT: movl %eax, var_46(%rip) ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -72,9 +72,9 @@ ; ; X64-LABEL: foo: ; X64: # %bb.0: # %bb -; X64-NEXT: movzbl var_27(%rip), %ecx ; X64-NEXT: movzwl var_22(%rip), %eax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NEXT: movzbl var_27(%rip), %ecx ; X64-NEXT: addb $30, %cl ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: movb %al, (%rax) @@ -89,10 +89,10 @@ ; 686-NEXT: .cfi_def_cfa_register %ebp ; 686-NEXT: andl $-8, %esp ; 686-NEXT: subl $8, %esp -; 686-NEXT: movzbl var_27, %ecx ; 686-NEXT: movzwl var_22, %eax ; 686-NEXT: movl %eax, (%esp) ; 686-NEXT: movl $0, {{[0-9]+}}(%esp) +; 686-NEXT: movzbl var_27, %ecx ; 686-NEXT: addb $30, %cl ; 686-NEXT: xorl %edx, %edx ; 686-NEXT: shrdl %cl, %edx, %eax diff --git a/llvm/test/CodeGen/X86/pr33290.ll b/llvm/test/CodeGen/X86/pr33290.ll --- a/llvm/test/CodeGen/X86/pr33290.ll +++ b/llvm/test/CodeGen/X86/pr33290.ll @@ -14,8 +14,8 @@ ; X86-NEXT: .LBB0_1: # %for.cond ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movzbl c, %ecx -; X86-NEXT: movb $0, c ; X86-NEXT: leal a+2(%ecx), %ecx +; X86-NEXT: movb $0, c ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: jmp .LBB0_1 ; diff --git a/llvm/test/CodeGen/X86/pr33828.ll b/llvm/test/CodeGen/X86/pr33828.ll --- a/llvm/test/CodeGen/X86/pr33828.ll +++ b/llvm/test/CodeGen/X86/pr33828.ll @@ -7,18 +7,18 @@ define void @foo(i8 %a0) { ; X86-LABEL: foo: ; X86: # %bb.0: # %entry -; X86-NEXT: movsbl var_580, %eax -; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF 
-; X86-NEXT: jne .LBB0_1 +; X86-NEXT: movb $1, %al +; X86-NEXT: testb %al, %al +; X86-NEXT: je .LBB0_1 ; X86-NEXT: # %bb.2: # %if.end13 ; X86-NEXT: retl ; X86-NEXT: .LBB0_1: # %if.then11 ; ; X64-LABEL: foo: ; X64: # %bb.0: # %entry -; X64-NEXT: movsbl var_580(%rip), %eax -; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF -; X64-NEXT: jne .LBB0_1 +; X64-NEXT: movb $1, %al +; X64-NEXT: testb %al, %al +; X64-NEXT: je .LBB0_1 ; X64-NEXT: # %bb.2: # %if.end13 ; X64-NEXT: retq ; X64-NEXT: .LBB0_1: # %if.then11 diff --git a/llvm/test/CodeGen/X86/pr34137.ll b/llvm/test/CodeGen/X86/pr34137.ll --- a/llvm/test/CodeGen/X86/pr34137.ll +++ b/llvm/test/CodeGen/X86/pr34137.ll @@ -8,16 +8,13 @@ define void @pr34127() { ; CHECK-LABEL: pr34127: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzwl var_13(%rip), %eax -; CHECK-NEXT: movzwl var_3(%rip), %ecx -; CHECK-NEXT: andw %ax, %cx -; CHECK-NEXT: movzwl %cx, %ecx -; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl var_3(%rip), %ecx +; CHECK-NEXT: movzwl var_3(%rip), %eax +; CHECK-NEXT: movzwl var_13(%rip), %ecx ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: testl %eax, %ecx +; CHECK-NEXT: andl %eax, %ecx +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: sete %dl -; CHECK-NEXT: andl %ecx, %edx +; CHECK-NEXT: andl %eax, %edx ; CHECK-NEXT: movq %rdx, var_212(%rip) ; CHECK-NEXT: movw $0, (%rax) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: vmovaps 80(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm11 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1] diff --git a/llvm/test/CodeGen/X86/pr35316.ll b/llvm/test/CodeGen/X86/pr35316.ll --- a/llvm/test/CodeGen/X86/pr35316.ll +++ b/llvm/test/CodeGen/X86/pr35316.ll @@ -25,20 +25,20 @@ ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl a(%rip), %esi ; CHECK-NEXT: movl $0, b(%rip) -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl c(%rip), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl a(%rip) +; CHECK-NEXT: idivl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movl c(%rip), %eax +; CHECK-NEXT: andl %r8d, %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %r8d -; CHECK-NEXT: andl %edi, %eax +; CHECK-NEXT: idivl %esi ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: movl %eax, (%rax) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr35443.ll b/llvm/test/CodeGen/X86/pr35443.ll --- a/llvm/test/CodeGen/X86/pr35443.ll +++ b/llvm/test/CodeGen/X86/pr35443.ll @@ -9,11 +9,10 @@ ; CHECK-LABEL: pr35443: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastb ac+4(%rip), %xmm0 -; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: 
vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpmovqd %ymm0, ai3+16(%rip) -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, ai3+16(%rip) ; CHECK-NEXT: retq entry: %wide.masked.load66 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr getelementptr inbounds ([20 x i8], ptr @ac, i64 0, i64 4), i32 1, <4 x i1> , <4 x i8> undef) diff --git a/llvm/test/CodeGen/X86/pr35765.ll b/llvm/test/CodeGen/X86/pr35765.ll --- a/llvm/test/CodeGen/X86/pr35765.ll +++ b/llvm/test/CodeGen/X86/pr35765.ll @@ -15,10 +15,9 @@ ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: movzwl x(%rip), %ecx ; CHECK-NEXT: movzwl s2(%rip), %edx -; CHECK-NEXT: notl %edx -; CHECK-NEXT: orl $63488, %edx # imm = 0xF800 -; CHECK-NEXT: movzwl %dx, %edx +; CHECK-NEXT: xorl $2047, %edx # imm = 0x7FF ; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: orl $63488, %edx # imm = 0xF800 ; CHECK-NEXT: xorl %eax, %edx ; CHECK-NEXT: movslq %edx, %rax ; CHECK-NEXT: movq %rax, ll(%rip) diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll --- a/llvm/test/CodeGen/X86/pr35982.ll +++ b/llvm/test/CodeGen/X86/pr35982.ll @@ -5,8 +5,14 @@ define float @PR35982_emms(<1 x i64>) nounwind { ; NO-POSTRA-LABEL: PR35982_emms: ; NO-POSTRA: # %bb.0: -; NO-POSTRA-NEXT: subl $8, %esp -; NO-POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax +; NO-POSTRA-NEXT: pushl %ebp +; NO-POSTRA-NEXT: movl %esp, %ebp +; NO-POSTRA-NEXT: andl $-8, %esp +; NO-POSTRA-NEXT: subl $16, %esp +; NO-POSTRA-NEXT: movl 8(%ebp), %eax +; NO-POSTRA-NEXT: movl 12(%ebp), %ecx +; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NO-POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; NO-POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; NO-POSTRA-NEXT: movd %mm0, %ecx @@ -15,14 +21,21 @@ ; NO-POSTRA-NEXT: fildl (%esp) ; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; NO-POSTRA-NEXT: addl $8, %esp +; NO-POSTRA-NEXT: movl %ebp, %esp +; NO-POSTRA-NEXT: popl %ebp ; NO-POSTRA-NEXT: retl ; ; POSTRA-LABEL: PR35982_emms: ; POSTRA: # %bb.0: -; POSTRA-NEXT: subl $8, %esp +; POSTRA-NEXT: pushl %ebp +; POSTRA-NEXT: movl %esp, %ebp +; POSTRA-NEXT: andl $-8, %esp +; POSTRA-NEXT: subl $16, %esp +; POSTRA-NEXT: movl 8(%ebp), %eax +; POSTRA-NEXT: movl 12(%ebp), %ecx +; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax ; POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; POSTRA-NEXT: movd %mm0, %ecx ; POSTRA-NEXT: emms @@ -30,7 +43,8 @@ ; POSTRA-NEXT: fildl (%esp) ; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; POSTRA-NEXT: addl $8, %esp +; POSTRA-NEXT: movl %ebp, %esp +; POSTRA-NEXT: popl %ebp ; POSTRA-NEXT: retl %2 = bitcast <1 x i64> %0 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 @@ -49,8 +63,14 @@ define float @PR35982_femms(<1 x i64>) nounwind { ; NO-POSTRA-LABEL: PR35982_femms: ; NO-POSTRA: # %bb.0: -; NO-POSTRA-NEXT: subl $8, %esp -; NO-POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax +; NO-POSTRA-NEXT: pushl %ebp +; NO-POSTRA-NEXT: movl %esp, %ebp +; NO-POSTRA-NEXT: andl $-8, %esp +; NO-POSTRA-NEXT: subl $16, %esp +; NO-POSTRA-NEXT: movl 8(%ebp), %eax +; NO-POSTRA-NEXT: movl 12(%ebp), %ecx +; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NO-POSTRA-NEXT: movl 
%eax, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; NO-POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; NO-POSTRA-NEXT: movd %mm0, %ecx @@ -59,14 +79,21 @@ ; NO-POSTRA-NEXT: fildl (%esp) ; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; NO-POSTRA-NEXT: addl $8, %esp +; NO-POSTRA-NEXT: movl %ebp, %esp +; NO-POSTRA-NEXT: popl %ebp ; NO-POSTRA-NEXT: retl ; ; POSTRA-LABEL: PR35982_femms: ; POSTRA: # %bb.0: -; POSTRA-NEXT: subl $8, %esp +; POSTRA-NEXT: pushl %ebp +; POSTRA-NEXT: movl %esp, %ebp +; POSTRA-NEXT: andl $-8, %esp +; POSTRA-NEXT: subl $16, %esp +; POSTRA-NEXT: movl 8(%ebp), %eax +; POSTRA-NEXT: movl 12(%ebp), %ecx +; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax ; POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; POSTRA-NEXT: movd %mm0, %ecx ; POSTRA-NEXT: femms @@ -74,7 +101,8 @@ ; POSTRA-NEXT: fildl (%esp) ; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; POSTRA-NEXT: addl $8, %esp +; POSTRA-NEXT: movl %ebp, %esp +; POSTRA-NEXT: popl %ebp ; POSTRA-NEXT: retl %2 = bitcast <1 x i64> %0 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 diff --git a/llvm/test/CodeGen/X86/pr38185.ll b/llvm/test/CodeGen/X86/pr38185.ll --- a/llvm/test/CodeGen/X86/pr38185.ll +++ b/llvm/test/CodeGen/X86/pr38185.ll @@ -14,11 +14,10 @@ ; CHECK-NEXT: # %bb.2: # %body ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl $1, (%rdx,%rax,4) -; CHECK-NEXT: movzbl (%rdi,%rax,4), %r8d -; CHECK-NEXT: movzbl (%rsi,%rax,4), %r9d -; CHECK-NEXT: andl %r8d, %r9d -; CHECK-NEXT: andl $1, %r9d -; CHECK-NEXT: movl %r9d, (%rdi,%rax,4) +; CHECK-NEXT: movl (%rdi,%rax,4), %r8d +; CHECK-NEXT: andl (%rsi,%rax,4), %r8d +; CHECK-NEXT: andl $1, %r8d +; CHECK-NEXT: movl %r8d, (%rdi,%rax,4) ; CHECK-NEXT: incq %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: jmp .LBB0_1 diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll --- a/llvm/test/CodeGen/X86/pr38217.ll +++ b/llvm/test/CodeGen/X86/pr38217.ll @@ -17,11 +17,11 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %r8 ; CHECK-NEXT: shrq $11, %rdx -; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710 -; CHECK-NEXT: movq %rdi, %r9 -; CHECK-NEXT: subq %rax, %r9 -; CHECK-NEXT: imulq $42949673, %r9, %rax # imm = 0x28F5C29 -; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: imull $10000, %edx, %eax # imm = 0x2710 +; CHECK-NEXT: movl %edi, %r9d +; CHECK-NEXT: subl %eax, %r9d +; CHECK-NEXT: imulq $1374389535, %r9, %rax # imm = 0x51EB851F +; CHECK-NEXT: shrq $37, %rax ; CHECK-NEXT: imull $100, %eax, %r10d ; CHECK-NEXT: subl %r10d, %r9d ; CHECK-NEXT: movl %ecx, %r10d diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll --- a/llvm/test/CodeGen/X86/pr38539.ll +++ b/llvm/test/CodeGen/X86/pr38539.ll @@ -6,13 +6,256 @@ define void @f() { ; X64-LABEL: f: ; X64: # %bb.0: # %BB -; X64-NEXT: movzbl (%rax), %eax +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movzbl (%rax), %ecx ; X64-NEXT: cmpb $0, (%rax) ; X64-NEXT: setne (%rax) -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rax, (%rax) ; X64-NEXT: movb $0, (%rax) ; X64-NEXT: retq +; +; X86-LABEL: f: +; X86: # %bb.0: # %BB_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: pushl %ebx +; 
X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $176, %esp +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movzbl (%eax), %eax +; X86-NEXT: movzbl (%eax), %ebx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: divb %bl +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shll $30, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarl $30, %ecx +; X86-NEXT: sarl $31, %eax +; X86-NEXT: shrdl $1, %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: andl $3, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # %bb.2: # %BB_udiv-special-cases +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: jmp .LBB0_3 +; X86-NEXT: .LBB0_1: +; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB0_4 +; X86-NEXT: # %bb.5: # %BB_udiv-special-cases +; X86-NEXT: addl $64, %ecx +; X86-NEXT: jmp .LBB0_6 +; X86-NEXT: .LBB0_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases +; X86-NEXT: subl $62, %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: addl $-66, %ecx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: adcl $3, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movb $1, %al +; X86-NEXT: testb %al, %al +; X86-NEXT: jne .LBB0_11 +; X86-NEXT: # %bb.7: # %BB_udiv-special-cases +; X86-NEXT: andl $3, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: xorl $65, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: je .LBB0_11 +; X86-NEXT: # %bb.8: # %udiv-bb1 +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: andl $3, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb $65, %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, %ch +; X86-NEXT: andb $7, %ch +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: negb %cl +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movsbl %cl, %edx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 136(%esp,%edx), %eax +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: notb %cl +; X86-NEXT: movl 128(%esp,%edx), %edi +; X86-NEXT: movl 132(%esp,%edx), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shldl %cl, 
%edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: je .LBB0_11 +; X86-NEXT: # %bb.9: # %udiv-preheader +; X86-NEXT: orl %edx, %eax +; X86-NEXT: andl $3, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movb %dl, %ch +; X86-NEXT: andb $7, %ch +; X86-NEXT: movb %dl, %cl +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movzbl %cl, %esi +; X86-NEXT: movl 80(%esp,%esi), %eax +; X86-NEXT: movl 84(%esp,%esi), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shrl %cl, %edi +; X86-NEXT: notb %cl +; X86-NEXT: movl 88(%esp,%esi), %esi +; X86-NEXT: addl %esi, %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $3, %eax +; X86-NEXT: andl $3, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: .LBB0_10: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: andl $2, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%eax,2), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $3, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: shll $30, %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $30, %edx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: shrdl $1, %ebx, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: andl $1, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: subl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: andl $3, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl $-1, %edx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $3, %edi +; X86-NEXT: andl $3, %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: jne .LBB0_10 +; X86-NEXT: .LBB0_11: # %udiv-end +; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X86-NEXT: setne (%eax) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%eax) +; X86-NEXT: movb $0, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl BB: %A30 = alloca i66 %L17 = load i66, ptr %A30 @@ -41,10 +284,10 @@ define void @g() { ; X64-LABEL: g: ; X64: # %bb.0: # %BB -; X64-NEXT: movzbl (%rax), %eax +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movzbl (%rax), %ecx ; X64-NEXT: cmpb $0, (%rax) ; X64-NEXT: setne (%rax) -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rax, (%rax) ; X64-NEXT: movb $0, (%rax) ; X64-NEXT: retq @@ -58,10 +301,10 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: movzbl (%eax), %eax +; X86-NEXT: leal -{{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl (%eax), %ecx ; X86-NEXT: cmpb $0, (%eax) ; X86-NEXT: setne (%eax) -; X86-NEXT: leal -{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%eax) ; X86-NEXT: movb $0, (%eax) ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll --- a/llvm/test/CodeGen/X86/pr38639.ll +++ b/llvm/test/CodeGen/X86/pr38639.ll @@ -10,6 +10,7 @@ ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] ; CHECK-NEXT: # xmm2 = mem[0,0] +; CHECK-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> , <8 x i32> diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll --- a/llvm/test/CodeGen/X86/pr38738.ll +++ b/llvm/test/CodeGen/X86/pr38738.ll @@ -130,22 +130,22 @@ ; X86SSE2-LABEL: tryset: ; X86SSE2: # %bb.0: ; X86SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86SSE2-NEXT: movl $0, 4(%eax) -; X86SSE2-NEXT: movl $0, (%eax) -; X86SSE2-NEXT: movl $0, 12(%eax) -; X86SSE2-NEXT: movl $0, 8(%eax) -; X86SSE2-NEXT: movl $0, 20(%eax) 
-; X86SSE2-NEXT: movl $0, 16(%eax) -; X86SSE2-NEXT: movl $0, 28(%eax) -; X86SSE2-NEXT: movl $0, 24(%eax) -; X86SSE2-NEXT: movl $0, 36(%eax) -; X86SSE2-NEXT: movl $0, 32(%eax) -; X86SSE2-NEXT: movl $0, 44(%eax) -; X86SSE2-NEXT: movl $0, 40(%eax) -; X86SSE2-NEXT: movl $0, 52(%eax) -; X86SSE2-NEXT: movl $0, 48(%eax) ; X86SSE2-NEXT: movl $0, 60(%eax) ; X86SSE2-NEXT: movl $0, 56(%eax) +; X86SSE2-NEXT: movl $0, 52(%eax) +; X86SSE2-NEXT: movl $0, 48(%eax) +; X86SSE2-NEXT: movl $0, 44(%eax) +; X86SSE2-NEXT: movl $0, 40(%eax) +; X86SSE2-NEXT: movl $0, 36(%eax) +; X86SSE2-NEXT: movl $0, 32(%eax) +; X86SSE2-NEXT: movl $0, 28(%eax) +; X86SSE2-NEXT: movl $0, 24(%eax) +; X86SSE2-NEXT: movl $0, 20(%eax) +; X86SSE2-NEXT: movl $0, 16(%eax) +; X86SSE2-NEXT: movl $0, 12(%eax) +; X86SSE2-NEXT: movl $0, 8(%eax) +; X86SSE2-NEXT: movl $0, 4(%eax) +; X86SSE2-NEXT: movl $0, (%eax) ; X86SSE2-NEXT: retl ; ; X64AVX-LABEL: tryset: diff --git a/llvm/test/CodeGen/X86/pr39666.ll b/llvm/test/CodeGen/X86/pr39666.ll --- a/llvm/test/CodeGen/X86/pr39666.ll +++ b/llvm/test/CodeGen/X86/pr39666.ll @@ -4,7 +4,8 @@ define <2 x i64> @test5(ptr %base, <2 x i64> %src0) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vmovddup (%rdi), %xmm1 # xmm1 = mem[0,0] +; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr %base, <2 x i1> , <2 x i64> %src0) ret <2 x i64>%res diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll --- a/llvm/test/CodeGen/X86/pr40730.ll +++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -17,10 +17,14 @@ } ; CHECK: .LCPI1_0: -; CHECK-NEXT: .quad 0x0000000e0000000d -; CHECK-NEXT: .quad 0x0000000e0000000d -; CHECK-NEXT: .quad 0x0000001000000000 -; CHECK-NEXT: .quad 0x0000000e0000000d +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .long 13 +; CHECK-NEXT: .long 14 +; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .long 16 define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) { ; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant: @@ -29,7 +33,9 @@ ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7] +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %a0, <8 x i32> , <8 x i32> ret <8 x i32> %res diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll --- a/llvm/test/CodeGen/X86/pr42727.ll +++ b/llvm/test/CodeGen/X86/pr42727.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: _ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastd (%eax), %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; CHECK-NEXT: vpsllq $56, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%eax) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll --- a/llvm/test/CodeGen/X86/pr42905.ll +++ b/llvm/test/CodeGen/X86/pr42905.ll @@ -4,16 +4,10 @@ define <4 x double> 
@autogen_SD30452(i1 %L230) { ; CHECK-LABEL: autogen_SD30452: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [151829,151829] -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; CHECK-NEXT: movq %xmm2, %rax -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: cvtsi2sd %rax, %xmm2 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [151829,151829] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: retq BB: %I = insertelement <4 x i64> zeroinitializer, i64 151829, i32 3 diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll --- a/llvm/test/CodeGen/X86/pr44976.ll +++ b/llvm/test/CodeGen/X86/pr44976.ll @@ -65,7 +65,7 @@ ; CHECK-NEXT: por %xmm5, %xmm3 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,1] ; CHECK-NEXT: movdqa %xmm0, %xmm5 ; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[2,0] ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -28,23 +28,24 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm4 -; CHECK-NEXT: vblendvps %ymm1, %ymm4, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u],zero,xmm2[u,u,u] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rdx), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %ecx, %xmm1 ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2 +; CHECK-NEXT: vmaskmovps 32(%rdx), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm1 +; CHECK-NEXT: vmovss %xmm1, 32(%rdi) ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm0 -; CHECK-NEXT: vmovss %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <9 x float> @llvm.masked.load.v9f32.p0(ptr %addr, i32 4, 
<9 x i1>%mask, <9 x float> %dst) @@ -70,38 +71,47 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm3 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: vpinsrb $9, %edx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: vpinsrb $10, %esi, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: vpinsrb $11, %edi, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: vpinsrb $12, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm6, %xmm6 -; CHECK-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm6 -; CHECK-NEXT: vblendvps %ymm1, %ymm6, %ymm0, %ymm0 -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 -; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm4, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 -; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm5, %xmm0 -; CHECK-NEXT: vmovss %xmm0, 48(%rdi) +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%r9), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $2, %esi, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 
+; CHECK-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 +; CHECK-NEXT: vmaskmovps 32(%r9), %ymm5, %ymm5 +; CHECK-NEXT: vblendvps %xmm2, %xmm5, %xmm3, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, 32(%rax) +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm2 +; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm4, %xmm1 +; CHECK-NEXT: vmovss %xmm1, 48(%rax) +; CHECK-NEXT: vmovaps %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <13 x float> @llvm.masked.load.v13f32.p0(ptr %addr, i32 4, <13 x i1>%mask, <13 x float> %dst) @@ -127,40 +137,51 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: vpinsrb $9, %edx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: vpinsrb $10, %esi, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: vpinsrb $11, %edi, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: vpinsrb $12, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: vpinsrb $13, %r9d, %xmm2, %xmm2 ; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm5 -; CHECK-NEXT: vblendvps %ymm1, %ymm5, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm2[u],zero,xmm2[u] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%r10), %ymm1, %ymm2 +; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $2, %esi, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1 ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: 
vpinsrw $4, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm5, %ymm5 -; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm0 -; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vmovlps %xmm0, 48(%rdi) -; CHECK-NEXT: vblendvps %xmm2, %xmm5, %xmm3, %xmm0 -; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) +; CHECK-NEXT: vmaskmovps 32(%r10), %ymm5, %ymm5 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm6 +; CHECK-NEXT: vblendvps %xmm1, %xmm6, %xmm4, %xmm1 +; CHECK-NEXT: vmovlps %xmm1, 48(%rax) +; CHECK-NEXT: vblendvps %xmm2, %xmm5, %xmm3, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, 32(%rax) +; CHECK-NEXT: vmovaps %ymm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = call <14 x float> @llvm.masked.load.v14f32.p0(ptr %addr, i32 4, <14 x i1>%mask, <14 x float> %dst) @@ -188,7 +209,6 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: vmovd %esi, %xmm3 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 @@ -221,8 +241,9 @@ ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; CHECK-NEXT: vmaskmovps 32(%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %r10d, %xmm3 +; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; CHECK-NEXT: vmaskmovps 64(%rdi), %ymm3, %ymm4 ; CHECK-NEXT: vblendvps %xmm3, %xmm4, %xmm0, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 64(%rax) diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -19,22 +19,23 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovd %eax, %xmm4 ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi) -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vmaskmovps %ymm3, %ymm4, 32(%rdi) +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -55,35 +56,44 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: vpinsrb $10, %edx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: vpinsrb $12, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; CHECK-NEXT: vmovd %eax, %xmm4 +; CHECK-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrw $2, %edx, 
%xmm4, %xmm4 +; CHECK-NEXT: vpinsrw $3, %esi, %xmm4, %xmm4 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 -; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm3, 32(%rdi) -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: vpinsrw $4, %r8d, %xmm4, %xmm4 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; CHECK-NEXT: vmaskmovps %ymm3, %ymm4, 32(%rdi) +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -104,41 +114,52 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $3, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %r9d, %xmm1, %xmm2 +; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: vpinsrb $10, %edx, 
%xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; CHECK-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d +; CHECK-NEXT: vpinsrb $12, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d +; CHECK-NEXT: vpinsrb $13, %r9d, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; CHECK-NEXT: vmovd %eax, %xmm4 +; CHECK-NEXT: vpinsrw $1, %ecx, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrw $3, %esi, %xmm4, %xmm4 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vpinsrw $4, %r8d, %xmm4, %xmm4 +; CHECK-NEXT: vpinsrw $5, %r9d, %xmm4, %xmm4 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; CHECK-NEXT: vmaskmovps %ymm3, %ymm4, 32(%rdi) +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi) +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v14f32.p0(<14 x float> %value, ptr %addr, i32 4, <14 x i1>%mask) @@ -165,9 +186,9 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: vmovd %eax, %xmm3 +; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 64(%rdi) ; CHECK-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll --- a/llvm/test/CodeGen/X86/pr46820.ll +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -11,13 +11,15 @@ ; CHECK-LABEL: load23: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups (%rsi), %zmm0 -; CHECK-NEXT: vmovaps 64(%rsi), %xmm1 -; CHECK-NEXT: vmovdqa 80(%rsi), %xmm2 -; CHECK-NEXT: vextractps $2, %xmm2, 88(%rdi) -; CHECK-NEXT: vmovq %xmm2, 80(%rdi) -; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm0, 
(%rdi) +; CHECK-NEXT: vmovups 64(%rsi), %ymm0 +; CHECK-NEXT: vmovups (%rsi), %zmm1 +; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss %xmm3, 88(%rdi) +; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, ptr %p, align 16 diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero ; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm3 * xmm1) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm2 ; CHECK-NEXT: vmulss %xmm1, %xmm2, %xmm1 @@ -187,7 +187,7 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm7 = (xmm13 * xmm7) - xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm7 = -(xmm13 * xmm7) + xmm0 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm1 = -(xmm1 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm3, %xmm1, %xmm0 ; CHECK-NEXT: vmulss %xmm0, %xmm8, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll --- a/llvm/test/CodeGen/X86/pr47517.ll +++ b/llvm/test/CodeGen/X86/pr47517.ll @@ -6,7 +6,19 @@ ; CHECK-LABEL: test: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $0, (%rdi) -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: mulss %xmm0, %xmm2 +; CHECK-NEXT: addss %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: addss %xmm2, %xmm1 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: mulss %xmm1, %xmm2 +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm1 +; CHECK-NEXT: addss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: retq entry: %a1 = getelementptr inbounds float, ptr %p, i32 1 diff --git a/llvm/test/CodeGen/X86/pr49162.ll b/llvm/test/CodeGen/X86/pr49162.ll --- a/llvm/test/CodeGen/X86/pr49162.ll +++ b/llvm/test/CodeGen/X86/pr49162.ll @@ -6,8 +6,7 @@ ; X86-LABEL: PR49162: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: shll $16, %ecx +; X86-NEXT: movl 6(%eax), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: shldl $16, %ecx, %eax @@ -17,10 +16,7 @@ ; ; X64-LABEL: PR49162: ; X64: # %bb.0: -; X64-NEXT: movl 8(%rsi), %eax -; X64-NEXT: shll $16, %eax -; X64-NEXT: cltq -; X64-NEXT: sarq $16, %rax +; X64-NEXT: movswq 8(%rsi), %rax ; X64-NEXT: leaq (%rdi,%rax,4), %rax ; X64-NEXT: retq %load160 = load i160, ptr %ptr160, align 4 diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll --- a/llvm/test/CodeGen/X86/pr49451.ll +++ b/llvm/test/CodeGen/X86/pr49451.ll @@ -10,7 +10,7 @@ ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: xorl %edx, 
%edx ; X86-NEXT: xorl %ebx, %ebx @@ -49,7 +49,6 @@ ; X64-NEXT: leal -23090(%rax), %edi ; X64-NEXT: movw %di, s_0(%rip) ; X64-NEXT: incq %rax -; X64-NEXT: leal -23091(%rax), %edi ; X64-NEXT: cmpw $73, %di ; X64-NEXT: jl .LBB0_1 ; X64-NEXT: # %bb.4: # %for.body1703 diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll --- a/llvm/test/CodeGen/X86/pr50609.ll +++ b/llvm/test/CodeGen/X86/pr50609.ll @@ -4,10 +4,11 @@ define void @PR50609(ptr noalias nocapture %RET, ptr noalias %aFOO, <16 x i32> %__mask) nounwind { ; CHECK-LABEL: PR50609: ; CHECK: # %bb.0: # %allocas -; CHECK-NEXT: leal 40(%rsi), %eax ; CHECK-NEXT: vmovq %rsi, %xmm2 -; CHECK-NEXT: vmovd %eax, %xmm3 +; CHECK-NEXT: addq $40, %rsi +; CHECK-NEXT: vmovq %rsi, %xmm3 ; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-NEXT: vpsrad $31, %xmm2, %xmm3 ; CHECK-NEXT: vpsrld $30, %xmm3, %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/pr51615.ll b/llvm/test/CodeGen/X86/pr51615.ll --- a/llvm/test/CodeGen/X86/pr51615.ll +++ b/llvm/test/CodeGen/X86/pr51615.ll @@ -11,14 +11,15 @@ ; AVX-LABEL: volatile_load_2_elts: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps g0(%rip), %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX-NEXT: vmovaps %ymm0, (%rax) -; AVX-NEXT: vmovaps %ymm1, (%rax) +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX-NEXT: vmovapd %ymm0, (%rax) +; AVX-NEXT: vmovaps %ymm2, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll --- a/llvm/test/CodeGen/X86/pr53419.ll +++ b/llvm/test/CodeGen/X86/pr53419.ll @@ -13,19 +13,14 @@ ; All four versions are semantically equivalent and should produce same asm as scalar version. 
define i1 @intrinsic_v2i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: intrinsic_v2i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movzwl (%rsi), %eax -; X64-NEXT: cmpw (%rdi), %ax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: intrinsic_v2i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -37,19 +32,14 @@ } define i1 @intrinsic_v4i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: intrinsic_v4i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movl (%rsi), %eax -; X64-NEXT: cmpl (%rdi), %eax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: intrinsic_v4i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -61,22 +51,16 @@ } define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: intrinsic_v8i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: cmpq (%rdi), %rax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: intrinsic_v8i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpmovmskb %xmm0, %eax +; X86-NEXT: cmpb $-1, %al ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -88,19 +72,14 @@ } define i1 @vector_version_v2i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: vector_version_v2i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movzwl (%rsi), %eax -; X64-NEXT: cmpw (%rdi), %ax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: vector_version_v2i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -113,19 +92,14 @@ } define i1 @vector_version_v4i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: vector_version_v4i8: -; X64: # %bb.0: # %bb -; X64-NEXT: 
movl (%rsi), %eax -; X64-NEXT: cmpl (%rdi), %eax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: vector_version_v4i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -138,22 +112,16 @@ } define i1 @vector_version_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: vector_version_v8i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: cmpq (%rdi), %rax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: vector_version_v8i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpmovmskb %xmm0, %eax +; X86-NEXT: xorb $-1, %al ; X86-NEXT: sete %al ; X86-NEXT: retl bb: diff --git a/llvm/test/CodeGen/X86/pr53842.ll b/llvm/test/CodeGen/X86/pr53842.ll --- a/llvm/test/CodeGen/X86/pr53842.ll +++ b/llvm/test/CodeGen/X86/pr53842.ll @@ -5,20 +5,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s define void @PR53842() { -; CHECK-LABEL: PR53842: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %ymm3 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %ymm2 -; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; CHECK-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; CHECK-NEXT: jmp .LBB0_1 entry: br label %vector.body @@ -36,3 +22,5 @@ unreachable } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/X86/pr56103.ll b/llvm/test/CodeGen/X86/pr56103.ll --- a/llvm/test/CodeGen/X86/pr56103.ll +++ b/llvm/test/CodeGen/X86/pr56103.ll @@ -16,7 +16,7 @@ ; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq $1, (%rax) ; CHECK-NEXT: movq a@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl (%rax), %ecx +; CHECK-NEXT: movslq (%rax), %rcx ; CHECK-NEXT: movl $-2, %eax ; CHECK-NEXT: sarl %cl, %eax ; CHECK-NEXT: movq c@GOTPCREL(%rip), %rdx diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -4,12 +4,11 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-LABEL: main.41: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpbroadcastw (%rax), %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %ymm2 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-NEXT: vmovdqu (%rax), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [16,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr57402.ll b/llvm/test/CodeGen/X86/pr57402.ll --- a/llvm/test/CodeGen/X86/pr57402.ll +++ b/llvm/test/CodeGen/X86/pr57402.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: PR57402: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: notl %eax -; CHECK-NEXT: andl $-2, %eax +; CHECK-NEXT: andl $65534, %eax # imm = 0xFFFE ; CHECK-NEXT: leal 1(%rax,%rax,2), %ecx ; CHECK-NEXT: movswq %cx, %rsi ; CHECK-NEXT: xorl %edi, %edi diff --git a/llvm/test/CodeGen/X86/pr57658.ll b/llvm/test/CodeGen/X86/pr57658.ll --- a/llvm/test/CodeGen/X86/pr57658.ll +++ b/llvm/test/CodeGen/X86/pr57658.ll @@ -6,9 +6,8 @@ ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] ; CHECK-NEXT: xorpd %xmm0, %xmm1 +; CHECK-NEXT: mulpd %xmm1, %xmm0 ; CHECK-NEXT: mulpd %xmm0, %xmm1 -; CHECK-NEXT: mulpd %xmm0, %xmm1 -; CHECK-NEXT: mulpd %xmm0, %xmm0 ; CHECK-NEXT: mulpd %xmm1, %xmm0 ; CHECK-NEXT: retq BB: diff --git a/llvm/test/CodeGen/X86/pr61923.ll b/llvm/test/CodeGen/X86/pr61923.ll --- a/llvm/test/CodeGen/X86/pr61923.ll +++ b/llvm/test/CodeGen/X86/pr61923.ll @@ -17,9 +17,12 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # %memcmp.loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups (%rsi,%rcx), %ymm0 -; CHECK-NEXT: vxorps (%rdi,%rcx), %ymm0, %ymm0 -; CHECK-NEXT: vptest %ymm0, %ymm0 +; CHECK-NEXT: vmovdqu (%rsi,%rcx), %xmm0 +; CHECK-NEXT: vmovdqu 16(%rsi,%rcx), %xmm1 +; CHECK-NEXT: vpxor (%rdi,%rcx), %xmm0, %xmm0 +; CHECK-NEXT: vpxor 16(%rdi,%rcx), %xmm1, %xmm1 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptest %xmm0, %xmm0 ; CHECK-NEXT: jne .LBB0_4 ; CHECK-NEXT: # %bb.3: # %memcmp.loop.latch ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 @@ -27,7 +30,6 @@ ; CHECK-NEXT: cmpq %rax, %rcx ; CHECK-NEXT: jb .LBB0_2 ; CHECK-NEXT: .LBB0_4: # %done -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %len.wide = zext i32 %len to i64 diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -28,32 +28,32 @@ ; AVX1-LABEL: PR62286: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[0,1,1,0] -; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -68,11 +68,12 @@ ; AVX512-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: vpaddd %zmm0, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxdq %ymm1, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1 +; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/pr63439.ll b/llvm/test/CodeGen/X86/pr63439.ll --- a/llvm/test/CodeGen/X86/pr63439.ll +++ b/llvm/test/CodeGen/X86/pr63439.ll @@ -34,8 +34,9 @@ ; ; AVX-LABEL: mulhu: ; AVX: # %bb.0: +; AVX-NEXT: movzwl %si, %eax ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vmovd %esi, %xmm1 +; AVX-NEXT: vmovd %eax, %xmm1 ; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/pr64593.ll b/llvm/test/CodeGen/X86/pr64593.ll --- a/llvm/test/CodeGen/X86/pr64593.ll +++ b/llvm/test/CodeGen/X86/pr64593.ll @@ -6,10 +6,8 @@ define void @pr64593(ptr %p) { ; 
CHECK-LABEL: pr64593: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpmovqb %zmm0, (%rdi) -; CHECK-NEXT: movq $0, 8(%rdi) -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: retq %v = insertelement <8 x i64> zeroinitializer, i64 0, i32 1 %trunc = trunc <8 x i64> %v to <8 x i8> diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll --- a/llvm/test/CodeGen/X86/pr67333.ll +++ b/llvm/test/CodeGen/X86/pr67333.ll @@ -10,18 +10,17 @@ ; CHECK-NEXT: movbel 0, %eax ; CHECK-NEXT: movbel 12(%rdi), %ecx ; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128] -; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpsrld $17, %xmm2, %xmm0 -; CHECK-NEXT: vpslld $15, %xmm2, %xmm3 -; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3 -; CHECK-NEXT: vpslld $13, %xmm2, %xmm4 -; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $15, %xmm2, %xmm1 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpsrld $19, %xmm2, %xmm1 +; CHECK-NEXT: vpslld $13, %xmm2, %xmm3 +; CHECK-NEXT: vpor %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %ecx, %xmm3 -; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpsllq $32, %xmm1, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; CHECK-NEXT: vpsrld $17, %xmm1, %xmm0 ; CHECK-NEXT: vpslld $15, %xmm1, %xmm3 @@ -63,18 +62,9 @@ ; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2 ; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm3 -; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vpsrld $17, %xmm1, %xmm2 -; CHECK-NEXT: vpslld $15, %xmm1, %xmm4 -; CHECK-NEXT: vpor %xmm2, %xmm4, %xmm2 -; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4 -; CHECK-NEXT: vpslld $13, %xmm1, %xmm5 -; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; CHECK-NEXT: vpsrld $10, %xmm1, %xmm4 -; CHECK-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm2 +; CHECK-NEXT: vmovq %xmm2, 132(%rdi) ; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3 ; CHECK-NEXT: vpslld $15, %xmm2, %xmm4 ; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 @@ -82,25 +72,33 @@ ; CHECK-NEXT: vpslld $13, %xmm2, %xmm5 ; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 ; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; CHECK-NEXT: vpsrld $10, %xmm2, %xmm2 -; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $17, %xmm0, %xmm2 +; CHECK-NEXT: vpsrld $10, %xmm2, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: movq $0, 140(%rdi) +; CHECK-NEXT: vpsrld $17, %xmm1, %xmm3 +; CHECK-NEXT: vpslld $15, %xmm1, %xmm4 +; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm1, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsrld $10, %xmm1, %xmm1 +; CHECK-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsrld $17, %xmm0, %xmm1 ; CHECK-NEXT: vpslld $15, %xmm0, %xmm3 -; CHECK-NEXT: vpor 
%xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpor %xmm1, %xmm3, %xmm1 ; CHECK-NEXT: vpsrld $19, %xmm0, %xmm3 ; CHECK-NEXT: vpslld $13, %xmm0, %xmm4 ; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; CHECK-NEXT: vpsrld $10, %xmm0, %xmm3 -; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vpsllq $32, %xmm1, %xmm3 -; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; CHECK-NEXT: vmovdqu %ymm0, 132(%rdi) -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vpsllq $32, %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vmovdqu %xmm0, 148(%rdi) ; CHECK-NEXT: retq entry: %0 = load i32, ptr null, align 4 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -30,9 +30,7 @@ ; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 -; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0 -; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 +; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -159,9 +157,7 @@ ; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 -; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0 -; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 +; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/promote-vec3.ll b/llvm/test/CodeGen/X86/promote-vec3.ll --- a/llvm/test/CodeGen/X86/promote-vec3.ll +++ b/llvm/test/CodeGen/X86/promote-vec3.ll @@ -59,7 +59,7 @@ ; SSE3-LABEL: sext_i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; SSE3-NEXT: shll $24, %ecx ; SSE3-NEXT: shll $8, %eax ; SSE3-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -744,87 +744,118 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: test13: ; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pslld 
$16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm6, %xmm5 -; SSE2-NEXT: psubusw %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test13: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: packssdw %xmm7, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubw %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test13: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; 
SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm4 +; SSE41-NEXT: pmaxud %xmm1, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: packssdw %xmm6, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] ; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: psubw %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test13: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test13: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test13: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpcmpnltud %ymm1, %ymm2, %k1 +; AVX512-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq vector.ph: @@ -998,87 +1029,117 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: test15: ; SSE2: # %bb.0: # %vector.ph +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = 
xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm6, %xmm5 -; SSE2-NEXT: psubusw %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test15: ; SSSE3: # %bb.0: # %vector.ph +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: packssdw %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubw %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test15: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pminud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pminud %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: packssdw %xmm6, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] ; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: psubw %xmm1, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test15: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test15: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpcmpnleud %ymm1, %ymm2, %k1 +; AVX512-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq vector.ph: @@ -1146,14 +1207,16 @@ ; SSE41-LABEL: test16: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pmaxud %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE41-NEXT: pmaxud %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE41-NEXT: packssdw %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pminud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pminud %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: packssdw %xmm6, %xmm3 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] ; SSE41-NEXT: packusdw %xmm2, %xmm1 @@ -1168,10 +1231,10 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 @@ -1184,8 +1247,8 @@ ; AVX2-LABEL: test16: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -1200,7 +1263,7 @@ ; AVX512-LABEL: test16: ; AVX512: # %bb.0: # %vector.ph ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpcmpltud %ymm2, %ymm1, %k1 +; AVX512-NEXT: vpcmpnleud %ymm1, %ymm2, %k1 ; AVX512-NEXT: vpmovdw %ymm1, %xmm1 ; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper @@ -1614,107 +1677,106 @@ ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 ; SSE2OR3-NEXT: pxor %xmm5, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm8 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] +; SSE2OR3-NEXT: movdqa %xmm6, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm7 
+; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm8, %xmm9 ; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2OR3-NEXT: pand %xmm8, %xmm4 -; SSE2OR3-NEXT: pxor %xmm7, %xmm8 -; SSE2OR3-NEXT: por %xmm4, %xmm8 +; SSE2OR3-NEXT: pand %xmm9, %xmm4 +; SSE2OR3-NEXT: pxor %xmm7, %xmm9 +; SSE2OR3-NEXT: por %xmm4, %xmm9 ; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 ; SSE2OR3-NEXT: pxor %xmm5, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2OR3-NEXT: pand %xmm10, %xmm4 +; SSE2OR3-NEXT: pand %xmm8, %xmm4 ; SSE2OR3-NEXT: pand %xmm4, %xmm3 ; SSE2OR3-NEXT: pxor %xmm7, %xmm4 ; SSE2OR3-NEXT: por %xmm3, %xmm4 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm8[0,2] +; SSE2OR3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[0,2] ; SSE2OR3-NEXT: pslld $16, %xmm4 ; SSE2OR3-NEXT: psrad $16, %xmm4 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 ; SSE2OR3-NEXT: pxor %xmm5, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2OR3-NEXT: pand %xmm9, %xmm3 +; SSE2OR3-NEXT: pand %xmm8, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm2 ; SSE2OR3-NEXT: pxor %xmm7, %xmm3 ; SSE2OR3-NEXT: por %xmm2, %xmm3 -; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 -; SSE2OR3-NEXT: pxor %xmm5, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2OR3-NEXT: pand %xmm6, %xmm2 -; SSE2OR3-NEXT: pxor %xmm2, %xmm7 -; SSE2OR3-NEXT: pand %xmm1, %xmm2 -; SSE2OR3-NEXT: por %xmm7, %xmm2 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2OR3-NEXT: pslld $16, %xmm2 -; SSE2OR3-NEXT: psrad $16, %xmm2 -; SSE2OR3-NEXT: packssdw %xmm4, %xmm2 -; SSE2OR3-NEXT: psubusw %xmm2, %xmm0 +; SSE2OR3-NEXT: pxor %xmm1, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm2 +; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm2, %xmm5 +; SSE2OR3-NEXT: pxor %xmm5, %xmm7 +; SSE2OR3-NEXT: pand %xmm1, %xmm5 +; SSE2OR3-NEXT: por %xmm7, %xmm5 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] +; SSE2OR3-NEXT: pslld $16, %xmm5 +; SSE2OR3-NEXT: psrad $16, %xmm5 +; SSE2OR3-NEXT: packssdw %xmm4, %xmm5 +; SSE2OR3-NEXT: psubusw %xmm5, %xmm0 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa 
{{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] -; SSE41-NEXT: movapd %xmm8, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm8, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [65535,65535] +; SSE41-NEXT: movapd %xmm6, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: movapd %xmm6, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm10, %xmm4 +; SSE41-NEXT: packusdw %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm7, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: packusdw %xmm3, %xmm8 -; SSE41-NEXT: packusdw %xmm4, %xmm8 -; SSE41-NEXT: psubusw %xmm8, %xmm5 +; SSE41-NEXT: pxor %xmm1, %xmm7 +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: packusdw %xmm3, %xmm6 +; SSE41-NEXT: packusdw %xmm4, %xmm6 +; SSE41-NEXT: psubusw %xmm6, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 ; SSE41-NEXT: retq ; @@ -1784,102 +1846,112 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: psubus_16i32_max: ; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: por %xmm3, %xmm9 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; 
SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: por %xmm2, %xmm10 -; SSE2-NEXT: pslld $16, %xmm10 -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: packssdw %xmm9, %xmm10 -; SSE2-NEXT: psubusw %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pxor %xmm6, %xmm8 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: psubd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: psubd %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: por %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: por %xmm8, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: por %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pslld $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm7, %xmm0 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: packssdw %xmm3, %xmm6 -; SSE2-NEXT: psubusw %xmm6, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm6, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: psubus_16i32_max: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm8 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pand %xmm9, %xmm3 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: por %xmm3, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm3, %xmm9 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 -; SSSE3-NEXT: pxor %xmm7, %xmm10 -; SSSE3-NEXT: movdqa %xmm6, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 -; SSSE3-NEXT: pand %xmm11, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm11 -; SSSE3-NEXT: por %xmm2, %xmm11 -; SSSE3-NEXT: pshufb %xmm3, %xmm11 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm9[0] -; SSSE3-NEXT: psubusw %xmm11, %xmm0 -; 
SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: por %xmm5, %xmm9 -; SSSE3-NEXT: pshufb %xmm3, %xmm9 -; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSSE3-NEXT: pxor %xmm6, %xmm8 +; SSSE3-NEXT: pxor %xmm8, %xmm8 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: psubd %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm4 +; SSSE3-NEXT: psubd %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm8, %xmm6 -; SSSE3-NEXT: pshufb %xmm3, %xmm6 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0] -; SSSE3-NEXT: psubusw %xmm6, %xmm1 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm8, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: por %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: psubd %xmm3, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm2, %xmm7 +; SSSE3-NEXT: pshufb %xmm2, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSSE3-NEXT: pshufb %xmm2, %xmm6 +; SSSE3-NEXT: pshufb %xmm2, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm6, %xmm3 -; SSE41-NEXT: pminud %xmm6, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: psubusw %xmm2, %xmm0 -; SSE41-NEXT: pminud %xmm6, %xmm5 -; SSE41-NEXT: pminud %xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 -; SSE41-NEXT: psubusw %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE41-NEXT: pmaxud %xmm5, %xmm1 +; SSE41-NEXT: psubd %xmm5, %xmm1 +; SSE41-NEXT: pmaxud %xmm4, %xmm7 +; SSE41-NEXT: psubd %xmm4, %xmm7 +; SSE41-NEXT: packusdw %xmm1, %xmm7 +; SSE41-NEXT: pmaxud %xmm3, %xmm0 +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; SSE41-NEXT: psubd %xmm2, %xmm6 +; SSE41-NEXT: packusdw %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: movdqa 
%xmm7, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_16i32_max: @@ -2708,29 +2780,29 @@ ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 ; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2OR3-NEXT: pand %xmm10, %xmm9 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm9, %xmm10 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm9 -; SSE2OR3-NEXT: por %xmm3, %xmm9 +; SSE2OR3-NEXT: pand %xmm10, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm10 +; SSE2OR3-NEXT: por %xmm3, %xmm10 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 -; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm11, %xmm3 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm2 ; SSE2OR3-NEXT: pxor %xmm8, %xmm3 ; SSE2OR3-NEXT: por %xmm2, %xmm3 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] +; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 ; SSE2OR3-NEXT: psubd %xmm3, %xmm2 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 @@ -2739,22 +2811,23 @@ ; SSE2OR3-NEXT: pand %xmm2, %xmm0 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 ; SSE2OR3-NEXT: pxor %xmm6, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm3 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2OR3-NEXT: pand %xmm9, %xmm2 +; SSE2OR3-NEXT: pand %xmm3, %xmm2 ; SSE2OR3-NEXT: pand %xmm2, %xmm5 ; SSE2OR3-NEXT: pxor %xmm8, %xmm2 ; SSE2OR3-NEXT: por %xmm5, %xmm2 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm3 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm7, %xmm3 +; SSE2OR3-NEXT: pand %xmm5, %xmm3 ; SSE2OR3-NEXT: pxor %xmm3, %xmm8 ; SSE2OR3-NEXT: pand %xmm4, %xmm3 ; SSE2OR3-NEXT: por %xmm8, %xmm3 @@ -2769,55 +2842,54 @@ ; ; SSE41-LABEL: test33: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa 
{{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm9, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm7 -; SSE41-NEXT: psubd %xmm3, %xmm7 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: movapd %xmm7, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm7, %xmm1 +; SSE41-NEXT: psubd %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test33: @@ -2931,29 +3003,29 @@ ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = 
[9223372039002259456,9223372039002259456] ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 ; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2OR3-NEXT: pand %xmm10, %xmm9 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm9, %xmm10 ; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm9 -; SSE2OR3-NEXT: por %xmm3, %xmm9 +; SSE2OR3-NEXT: pand %xmm10, %xmm3 +; SSE2OR3-NEXT: pxor %xmm8, %xmm10 +; SSE2OR3-NEXT: por %xmm3, %xmm10 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 -; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm11, %xmm3 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm2 ; SSE2OR3-NEXT: pxor %xmm8, %xmm3 ; SSE2OR3-NEXT: por %xmm2, %xmm3 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] +; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 ; SSE2OR3-NEXT: psubd %xmm3, %xmm2 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 @@ -2962,22 +3034,23 @@ ; SSE2OR3-NEXT: pand %xmm2, %xmm0 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 ; SSE2OR3-NEXT: pxor %xmm6, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm3 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2OR3-NEXT: pand %xmm9, %xmm2 +; SSE2OR3-NEXT: pand %xmm3, %xmm2 ; SSE2OR3-NEXT: pand %xmm2, %xmm5 ; SSE2OR3-NEXT: pxor %xmm8, %xmm2 ; SSE2OR3-NEXT: por %xmm5, %xmm2 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm3 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm7, %xmm3 +; SSE2OR3-NEXT: pand %xmm5, %xmm3 ; SSE2OR3-NEXT: pxor %xmm3, %xmm8 ; SSE2OR3-NEXT: pand %xmm4, %xmm3 ; SSE2OR3-NEXT: por %xmm8, %xmm3 @@ -2996,53 +3069,52 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; 
SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm9, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] ; SSE41-NEXT: pmaxud %xmm3, %xmm6 ; SSE41-NEXT: psubd %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: movapd %xmm7, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm7, %xmm1 +; SSE41-NEXT: psubd %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll --- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll +++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll @@ -16,7 +16,7 @@ ; X86-LABEL: and_signbit_shl: ; X86: # %bb.0: ; 
X86-NEXT: movl 8(%esp), %ecx -; X86-NEXT: movzbl 6(%esp), %eax +; X86-NEXT: movzwl 6(%esp), %eax ; X86-NEXT: shll $24, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -10,11 +10,17 @@ ; with one of the shifts from the rotate idiom define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) { -; CHECK-LABEL: vroll_v4i32_extract_shl: -; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 -; CHECK-NEXT: vprold $7, %xmm0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: vroll_v4i32_extract_shl: +; X86: # %bb.0: +; X86-NEXT: vprold $10, %xmm0, %xmm0 +; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: vroll_v4i32_extract_shl: +; X64: # %bb.0: +; X64-NEXT: vprold $10, %xmm0, %xmm0 +; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-NEXT: retq %lhs_mul = shl <4 x i32> %i, %rhs_mul = shl <4 x i32> %i, %lhs_shift = lshr <4 x i32> %lhs_mul, @@ -23,11 +29,17 @@ } define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind { -; CHECK-LABEL: vrolq_v4i64_extract_shrl: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0 -; CHECK-NEXT: vprolq $29, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: vrolq_v4i64_extract_shrl: +; X86: # %bb.0: +; X86-NEXT: vprolq $24, %ymm0, %ymm0 +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: vrolq_v4i64_extract_shrl: +; X64: # %bb.0: +; X64-NEXT: vprolq $24, %ymm0, %ymm0 +; X64-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; X64-NEXT: retq %lhs_div = lshr <4 x i64> %i, %rhs_div = lshr <4 x i64> %i, %rhs_shift = shl <4 x i64> %rhs_div, diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -12,20 +12,19 @@ define i64 @rolq_extract_shl(i64 %i) nounwind { ; X86-LABEL: rolq_extract_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shldl $3, %edx, %ecx -; X86-NEXT: shll $3, %eax -; X86-NEXT: shll $3, %edx -; X86-NEXT: shrdl $25, %edx, %eax -; X86-NEXT: shrdl $25, %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shldl $10, %eax, %edx +; X86-NEXT: shldl $10, %ecx, %eax +; X86-NEXT: andl $-897, %eax # imm = 0xFC7F ; X86-NEXT: retl ; ; X64-LABEL: rolq_extract_shl: ; X64: # %bb.0: -; X64-NEXT: leaq (,%rdi,8), %rax -; X64-NEXT: rolq $7, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: rolq $10, %rax +; X64-NEXT: andq $-897, %rax # imm = 0xFC7F ; X64-NEXT: retq %lhs_mul = shl i64 %i, 3 %rhs_mul = shl i64 %i, 10 @@ -38,16 +37,16 @@ ; X86-LABEL: rolw_extract_shrl: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: rolw $12, %ax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: andl $61951, %eax # imm = 0xF1FF ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: rolw_extract_shrl: ; X64: # %bb.0: -; X64-NEXT: movzwl %di, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: rolw $12, %ax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: rolw $9, %ax +; X64-NEXT: andl $61951, %eax # imm = 0xF1FF ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %lhs_div = lshr i16 %i, 7 @@ -82,18 
+81,24 @@ ; X86-LABEL: rolb_extract_udiv: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $171, %eax, %eax +; X86-NEXT: imull $171, %eax, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrl $9, %eax -; X86-NEXT: rolb $4, %al +; X86-NEXT: shrl $13, %ecx +; X86-NEXT: shlb $4, %al +; X86-NEXT: orb %cl, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: rolb_extract_udiv: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: imull $171, %eax, %eax +; X64-NEXT: imull $171, %eax, %ecx +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: shrl $9, %eax -; X64-NEXT: rolb $4, %al +; X64-NEXT: shrl $13, %ecx +; X64-NEXT: shlb $4, %al +; X64-NEXT: orb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 @@ -119,8 +124,12 @@ ; X64-LABEL: rolq_extract_mul_with_mask: ; X64: # %bb.0: ; X64-NEXT: leaq (%rdi,%rdi,8), %rax -; X64-NEXT: rolq $7, %rax -; X64-NEXT: movzbl %al, %eax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: shll $7, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: shrq $57, %rax +; X64-NEXT: orq %rcx, %rax ; X64-NEXT: retq %lhs_mul = mul i64 %i, 1152 %rhs_mul = mul i64 %i, 9 @@ -223,33 +232,32 @@ define i8 @no_extract_udiv(i8 %i) nounwind { ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $171, %eax, %ecx -; X86-NEXT: imull $79, %eax, %edx -; X86-NEXT: subb %dh, %al -; X86-NEXT: shrb %al -; X86-NEXT: addb %dh, %al -; X86-NEXT: shrb $5, %al -; X86-NEXT: shlb $3, %ch -; X86-NEXT: orb %al, %ch -; X86-NEXT: andb $-9, %ch -; X86-NEXT: movb %ch, %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull $171, %ecx, %eax +; X86-NEXT: shrl $9, %eax +; X86-NEXT: imull $79, %ecx, %edx +; X86-NEXT: subb %dh, %cl +; X86-NEXT: shrb %cl +; X86-NEXT: addb %dh, %cl +; X86-NEXT: shrb $5, %cl +; X86-NEXT: shlb $4, %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: imull $171, %ecx, %eax -; X64-NEXT: shrl $8, %eax +; X64-NEXT: shrl $9, %eax ; X64-NEXT: imull $79, %ecx, %edx ; X64-NEXT: shrl $8, %edx ; X64-NEXT: subb %dl, %cl ; X64-NEXT: shrb %cl ; X64-NEXT: addb %dl, %cl ; X64-NEXT: shrb $5, %cl -; X64-NEXT: shlb $3, %al +; X64-NEXT: shlb $4, %al ; X64-NEXT: orb %cl, %al -; X64-NEXT: andb $-9, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll --- a/llvm/test/CodeGen/X86/rotate.ll +++ b/llvm/test/CodeGen/X86/rotate.ll @@ -569,11 +569,11 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %ecx ; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shldl $31, %edx, %esi -; X86-NEXT: shldl $31, %ecx, %edx -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl $31, %ecx, %esi +; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll --- a/llvm/test/CodeGen/X86/rotate4.ll +++ b/llvm/test/CodeGen/X86/rotate4.ll @@ -244,32 +244,32 @@ ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %esi 
+; X86-NEXT: movl (%eax), %edx ; X86-NEXT: movl 4(%eax), %ebx -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shldl %cl, %edx, %edi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: .LBB6_2: ; X86-NEXT: negb %cl ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: shrdl %cl, %ebx, %edx ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %ebp, %edx ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB6_4: -; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ebp, %edi -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: orl %edx, %esi ; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi @@ -336,10 +336,10 @@ ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB7_4: -; X86-NEXT: orl %ebp, %edi ; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: orl %ebp, %edi ; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -36,16 +36,34 @@ } define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) { -; XOP-LABEL: rot_v4i32_splat_2masks: -; XOP: # %bb.0: -; XOP-NEXT: vprotd $31, %xmm0, %xmm0 -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_splat_2masks: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; XOPAVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_splat_2masks: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_splat_2masks: ; AVX512: # %bb.0: -; AVX512-NEXT: vprold $31, %xmm0, %xmm0 -; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = and <4 x i32> %1, @@ -57,16 +75,34 @@ } define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { -; XOP-LABEL: rot_v4i32_non_splat_2masks: -; XOP: # %bb.0: -; XOP-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_non_splat_2masks: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_non_splat_2masks: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_non_splat_2masks: ; AVX512: # %bb.0: -; AVX512-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = and <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -78,9 +78,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -103,9 +103,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -233,9 +233,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -260,9 +260,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -430,9 +430,9 @@ ; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -462,9 +462,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -491,9 +491,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -692,16 +692,40 @@ ; SSE2-LABEL: sad_nonloop_4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX-LABEL: sad_nonloop_4i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq %v1 = load <4 x i8>, <4 x i8>* %p, align 1 @@ -729,13 +753,55 @@ ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_nonloop_8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_nonloop_8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_nonloop_8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_nonloop_8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsd %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <8 x i8>, <8 x i8>* %p, align 1 %z1 = zext <8 x i8> %v1 to <8 x i32> %v2 = load <8 x i8>, <8 x i8>* %q, align 1 @@ -759,20 +825,83 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pminub %xmm1, %xmm2 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_nonloop_16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_nonloop_16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpabsd %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_nonloop_16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_nonloop_16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <16 x i8>, <16 x i8>* %p, align 1 %z1 = zext <16 x i8> %v1 to <16 x i32> %v2 = load <16 x i8>, <16 x i8>* %q, align 1 @@ -810,36 +939,102 @@ ; ; AVX1-LABEL: sad_nonloop_32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpabsd %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm4, %xmm1 +; AVX1-NEXT: vpabsd %xmm5, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm6, %xmm2 +; AVX1-NEXT: vpabsd %xmm7, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: sad_nonloop_32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpabsd %ymm2, %ymm1 +; AVX2-NEXT: vpabsd %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: sad_nonloop_32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm1, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd 
%zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -890,66 +1085,176 @@ ; ; AVX1-LABEL: sad_nonloop_64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 -; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3 -; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2 -; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm3, %xmm3 +; AVX1-NEXT: 
vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm8, %xmm8 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm9, %xmm9 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm10, %xmm10 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm11, %xmm11 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm12, %xmm12 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm13, %xmm13 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm14, %xmm14 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm15, %xmm0 +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpabsd %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm4, %xmm2 +; AVX1-NEXT: vpabsd %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpabsd %xmm6, %xmm3 +; AVX1-NEXT: vpabsd %xmm7, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm8, %xmm2 +; AVX1-NEXT: vpabsd %xmm9, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpabsd %xmm10, %xmm3 +; AVX1-NEXT: vpabsd %xmm11, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm12, %xmm3 +; AVX1-NEXT: vpabsd %xmm13, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpabsd %xmm14, %xmm4 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 
= xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: sad_nonloop_64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm7 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpabsd %ymm2, %ymm1 +; AVX2-NEXT: vpabsd %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm4, %ymm1 +; AVX2-NEXT: vpabsd %ymm5, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpabsd %ymm6, %ymm2 +; AVX2-NEXT: vpabsd %ymm7, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: sad_nonloop_64i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_nonloop_64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpsadbw (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: sad_nonloop_64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm1, %zmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm3, %zmm3 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm1, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpabsd %zmm2, %zmm1 +; AVX512-NEXT: vpabsd %zmm3, %zmm2 +; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: 
vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <64 x i8>, <64 x i8>* %p, align 1 %z1 = zext <64 x i8> %v1 to <64 x i32> %v2 = load <64 x i8>, <64 x i8>* %q, align 1 diff --git a/llvm/test/CodeGen/X86/sad_variations.ll b/llvm/test/CodeGen/X86/sad_variations.ll --- a/llvm/test/CodeGen/X86/sad_variations.ll +++ b/llvm/test/CodeGen/X86/sad_variations.ll @@ -13,14 +13,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_sge: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 @@ -53,14 +45,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_sgt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -92,14 +76,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_sle: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -131,14 +107,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -170,14 +138,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_64bit_icmp_sext_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq entry: br label %for.body @@ -209,14 +169,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_64bit_icmp_zext_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq entry: br label %for.body @@ -248,14 +200,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_early_64bit_icmp_zext_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq entry: br label %for.body @@ -277,3 +221,5 @@ %8 = extractelement <8 x i64> %bin.rdx239, i32 0 ret i64 %8 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll --- a/llvm/test/CodeGen/X86/sadd_sat.ll +++ b/llvm/test/CodeGen/X86/sadd_sat.ll @@ -75,7 +75,7 @@ ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addw %cx, %dx ; X86-NEXT: movswl %dx, %edx -; X86-NEXT: sarl $15, %edx +; X86-NEXT: shrl $15, %edx ; X86-NEXT: xorl $-32768, %edx # imm = 0x8000 ; X86-NEXT: addw %cx, %ax ; X86-NEXT: cmovol %edx, %eax @@ -88,7 +88,7 @@ ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rsi), %eax ; X64-NEXT: cwtl -; X64-NEXT: sarl $15, %eax +; X64-NEXT: shrl $15, %eax ; X64-NEXT: xorl $-32768, %eax # imm = 0x8000 ; X64-NEXT: addw %si, %di ; X64-NEXT: cmovnol %edi, %eax diff --git a/llvm/test/CodeGen/X86/sadd_sat_plus.ll b/llvm/test/CodeGen/X86/sadd_sat_plus.ll --- a/llvm/test/CodeGen/X86/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_plus.ll @@ -79,7 +79,7 @@ ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addw %cx, %dx ; X86-NEXT: movswl %dx, %edx -; X86-NEXT: sarl $15, %edx +; X86-NEXT: shrl $15, %edx ; X86-NEXT: xorl $-32768, %edx # imm = 0x8000 ; X86-NEXT: addw %cx, %ax ; X86-NEXT: cmovol %edx, %eax @@ -93,7 +93,7 @@ ; X64-NEXT: imull %edx, %esi ; X64-NEXT: leal (%rdi,%rsi), %eax ; X64-NEXT: cwtl -; X64-NEXT: sarl $15, %eax +; X64-NEXT: shrl $15, %eax ; X64-NEXT: xorl $-32768, %eax # imm = 0x8000 ; X64-NEXT: addw %si, %di ; X64-NEXT: cmovnol %edi, %eax diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -468,7 +468,7 @@ ; SSE-NEXT: movzwl (%rsi), %ecx ; SSE-NEXT: leal (%rax,%rcx), %esi ; SSE-NEXT: movswl %si, %esi -; SSE-NEXT: sarl $15, %esi +; SSE-NEXT: shrl $15, %esi ; SSE-NEXT: xorl $-32768, %esi # imm = 0x8000 ; SSE-NEXT: addw %cx, %ax ; SSE-NEXT: cmovol %esi, %eax @@ -481,7 +481,7 @@ ; AVX-NEXT: movzwl (%rsi), %ecx ; AVX-NEXT: leal (%rax,%rcx), %esi ; AVX-NEXT: movswl %si, %esi -; AVX-NEXT: sarl $15, %esi +; AVX-NEXT: shrl $15, %esi ; AVX-NEXT: xorl $-32768, %esi # imm = 0x8000 ; AVX-NEXT: addw %cx, %ax ; AVX-NEXT: cmovol %esi, %eax @@ -1175,22 +1175,23 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: paddq %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvpd 
%xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1352,40 +1353,42 @@ ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1650,74 +1653,78 @@ ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 
= [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm10 +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm12 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm13 +; SSE41-NEXT: por %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE41-NEXT: pxor %xmm13, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm3, %xmm10 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm7, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = 
xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -383,10 +383,19 @@ ; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %a %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -399,10 +408,18 @@ ; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -441,10 +458,19 @@ ; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %a %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -457,10 +483,18 @@ ; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: 
unsigned_sat_constant_v8i16_using_cmp_notval: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -611,12 +645,13 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 @@ -747,11 +782,13 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [42,42] ; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -760,11 +797,13 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -837,10 +876,19 @@ ; SSE-NEXT: paddusb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; 
AVX512-NEXT: retq %a = add <16 x i8> %x, %y %c = icmp ugt <16 x i8> %x, %a %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -934,10 +982,19 @@ ; SSE-NEXT: paddusw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <8 x i16> %x, %y %c = icmp ugt <8 x i16> %x, %a %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll --- a/llvm/test/CodeGen/X86/scalar_widen_div.ll +++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll @@ -393,26 +393,27 @@ ; CHECK-NEXT: jle .LBB12_3 ; CHECK-NEXT: # %bb.1: # %bb.nph ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: xorl %r10d, %r10d +; CHECK-NEXT: xorl %r11d, %r11d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB12_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rdi,%r10), %r8d -; CHECK-NEXT: movl 4(%rdi,%r10), %eax +; CHECK-NEXT: movl 8(%rdi,%r11), %r8d +; CHECK-NEXT: movl (%rdi,%r11), %r9d +; CHECK-NEXT: movl 4(%rdi,%r11), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl 4(%rsi,%r10) -; CHECK-NEXT: movl %eax, %r9d -; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: idivl 4(%rsi,%r11) +; CHECK-NEXT: movl %eax, %r10d +; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl (%rsi,%r10) +; CHECK-NEXT: idivl (%rsi,%r11) ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrd $1, %r9d, %xmm0 -; CHECK-NEXT: movl 8(%rdi,%r10), %eax +; CHECK-NEXT: pinsrd $1, %r10d, %xmm0 +; CHECK-NEXT: movl %r8d, %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl 8(%rsi,%r10) -; CHECK-NEXT: movl %eax, 8(%rdi,%r10) -; CHECK-NEXT: movq %xmm0, (%rdi,%r10) -; CHECK-NEXT: addq $16, %r10 +; CHECK-NEXT: idivl 8(%rsi,%r11) +; CHECK-NEXT: movl %eax, 8(%rdi,%r11) +; CHECK-NEXT: movq %xmm0, (%rdi,%r11) +; CHECK-NEXT: addq $16, %r11 ; CHECK-NEXT: decl %ecx ; CHECK-NEXT: jne .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll --- a/llvm/test/CodeGen/X86/scalarize-fp.ll +++ b/llvm/test/CodeGen/X86/scalarize-fp.ll @@ -671,7 +671,12 @@ ; ; AVX-LABEL: splat0_fdiv_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vrcpps %ymm1, %ymm2 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -745,6 +750,11 @@ ; ; AVX-LABEL: splat0_fdiv_const_op1_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vrcpps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -764,8 +774,12 @@ ; ; AVX-LABEL: 
splat0_fdiv_const_op0_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll --- a/llvm/test/CodeGen/X86/sdiv-exact.ll +++ b/llvm/test/CodeGen/X86/sdiv-exact.ll @@ -83,11 +83,12 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145] +; X86-NEXT: movaps %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -108,11 +109,12 @@ ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: psrad $1, %xmm0 ; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] -; X86-NEXT: pmuludq %xmm0, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997] +; X86-NEXT: movapd %xmm0, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqa %xmm1, %xmm0 @@ -130,11 +132,12 @@ define <4 x i32> @test7(<4 x i32> %x) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl ; @@ -152,11 +155,12 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2863311531,2863311531] +; X86-NEXT: movapd %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl diff --git 
a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -153,27 +153,26 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax -; X86-NEXT: movswl %ax, %esi ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: shrl $4, %esi +; X86-NEXT: shll $4, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %bl ; X86-NEXT: testw %cx, %cx ; X86-NEXT: sets %cl -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %ch -; X86-NEXT: xorb %cl, %ch +; X86-NEXT: xorb %bl, %cl ; X86-NEXT: testw %dx, %dx -; X86-NEXT: setne %cl -; X86-NEXT: testb %ch, %cl +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl ; X86-NEXT: cmovel %eax, %edi ; X86-NEXT: addl %edi, %edi ; X86-NEXT: movswl %di, %eax @@ -181,6 +180,7 @@ ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %y2 = sext i8 %y to i15 %y3 = shl i15 %y2, 7 @@ -535,168 +535,169 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $60, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shll $31, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: andl $-2147483648, %ebx # imm = 0x80000000 +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: andl $-2147483648, %ebp # imm = 0x80000000 +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: andl $-2147483648, %ebp # imm = 0x80000000 +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sets (%esp) # 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: andl $-2147483648, %edi # imm = 0x80000000 +; X86-NEXT: orl %eax, %edi +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: pushl %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: calll __moddi3 +; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl 
$16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: shll $31, %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %ebx, %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebp +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shll $31, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shrl $31, %edi -; X86-NEXT: shldl $31, %edx, %edi +; X86-NEXT: shll $31, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: pushl %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: sarl $31, %ebx +; X86-NEXT: shll $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shll $31, %esi ; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: shrl $31, %ebp -; X86-NEXT: shldl $31, %ecx, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax +; X86-NEXT: sarl $31, %ebp ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: sets %bl +; X86-NEXT: xorb (%esp), %bl # 1-byte Folded Reload ; 
X86-NEXT: pushl %ebp +; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl %esi +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: sets %cl -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: sets %dl -; X86-NEXT: xorb %cl, %dl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: setne %cl -; X86-NEXT: testb %dl, %cl +; X86-NEXT: testb %bl, %cl ; X86-NEXT: leal -1(%eax), %ecx ; X86-NEXT: cmovel %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %al +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: setne %dl +; X86-NEXT: testb %al, %dl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: cmovel %eax, %edi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl +; X86-NEXT: sets %dl +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: setne %al -; X86-NEXT: testb %cl, %al +; X86-NEXT: setne %dh +; X86-NEXT: testb %dl, %dh ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %ecx -; X86-NEXT: cmovel %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %al +; X86-NEXT: leal -1(%eax), %edx +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl +; X86-NEXT: sets %bl +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: setne %al -; X86-NEXT: testb %cl, %al +; X86-NEXT: setne %bh +; X86-NEXT: testb %bl, %bh ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %ebp -; X86-NEXT: cmovel %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %bl -; X86-NEXT: xorb %al, %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: leal -1(%edi), %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %edx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: calll __moddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: orl %eax, %edx -; X86-NEXT: setne %al -; X86-NEXT: testb %bl, %al -; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: cmovel %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl (%esp), 
%ecx # 4-byte Reload +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $60, %esp +; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -181,27 +181,26 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax -; X86-NEXT: movswl %ax, %esi ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: shrl $4, %esi +; X86-NEXT: shll $4, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %bl ; X86-NEXT: testw %cx, %cx ; X86-NEXT: sets %cl -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %ch -; X86-NEXT: xorb %cl, %ch +; X86-NEXT: xorb %bl, %cl ; X86-NEXT: testw %dx, %dx -; X86-NEXT: setne %cl -; X86-NEXT: testb %ch, %cl +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl ; X86-NEXT: cmovnel %edi, %eax ; X86-NEXT: movswl %ax, %ecx ; X86-NEXT: cmpl $16383, %ecx # imm = 0x3FFF @@ -214,6 +213,7 @@ ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %y2 = sext i8 %y to i15 %y3 = shl i15 %y2, 7 @@ -563,232 +563,231 @@ ; X64-NEXT: subq $120, %rsp ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; X64-NEXT: psllq $32, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; X64-NEXT: psrad $31, %xmm2 -; X64-NEXT: psrlq $31, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-NEXT: movq %xmm0, %r14 +; X64-NEXT: movq %r14, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: shldq $31, %r14, %rbx ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: 
shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rdx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rdx, %r13 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: sbbq $0, %rax -; X64-NEXT: cmovgeq %rcx, %r14 -; X64-NEXT: cmovgeq %rdx, %rbp +; X64-NEXT: cmovgeq %rcx, %rbp +; X64-NEXT: cmovgeq %rdx, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rcx +; X64-NEXT: cmpq %r13, %rcx ; X64-NEXT: movq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movq %rbp, %xmm0 +; X64-NEXT: sbbq %rbp, %rax +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbx, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %r14, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: 
setne %cl +; X64-NEXT: testl %r15d, %ecx ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: sbbq $0, %rax ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovgeq %rax, %r14 -; X64-NEXT: cmovgeq %rcx, %rbp +; X64-NEXT: cmovgeq %rax, %rbp +; X64-NEXT: cmovgeq %rcx, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rcx +; X64-NEXT: cmpq %r13, %rcx ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movq %rbp, %xmm0 +; X64-NEXT: sbbq %rbp, %rax +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[0,1,1,3] -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X64-NEXT: psrad $31, %xmm1 -; X64-NEXT: psrlq $31, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpgtd %xmm0, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rdx +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: paddq %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %r14 +; X64-NEXT: movq %r14, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: shldq $31, %r14, %rbx +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; 
X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: sbbq $0, %rax ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovgeq %rax, %r14 -; X64-NEXT: cmovgeq %rcx, %rbp +; X64-NEXT: cmovgeq %rax, %rbp +; X64-NEXT: cmovgeq %rcx, %r13 ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rcx +; X64-NEXT: cmpq %r13, %rcx ; X64-NEXT: movq $-1, %rax -; X64-NEXT: sbbq %r14, %rax -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movq %rbp, %xmm0 +; X64-NEXT: sbbq %rbp, %rax +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: sarq $63, %r13 -; X64-NEXT: shldq $31, %rbx, %r13 +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbx, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: subq $1, %rbp -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %r13, %rsi +; X64-NEXT: subq $1, %r13 +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %r14, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx ; X64-NEXT: cmoveq 
{{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF -; X64-NEXT: cmpq %rcx, %rbp -; X64-NEXT: movq %r14, %rax +; X64-NEXT: cmpq %rcx, %r13 +; X64-NEXT: movq %rbp, %rax ; X64-NEXT: sbbq $0, %rax ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovgeq %rax, %r14 -; X64-NEXT: cmovgeq %rcx, %rbp -; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 -; X64-NEXT: cmpq %rbp, %rax -; X64-NEXT: sbbq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: cmovgeq %rax, %rbp -; X64-NEXT: movq %rbp, %xmm1 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: psrlq $1, %xmm0 -; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2] +; X64-NEXT: cmovgeq %rcx, %r13 +; X64-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 +; X64-NEXT: cmpq %r13, %rax +; X64-NEXT: sbbq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; X64-NEXT: cmovgeq %rax, %r13 +; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-NEXT: psrlq $1, %xmm1 +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-NEXT: addq $120, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -811,45 +810,44 @@ ; X86-NEXT: movl 16(%ebp), %ebx ; X86-NEXT: movl 32(%ebp), %eax ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal (%ebx,%ebx), %eax -; X86-NEXT: shrl $31, %ebx -; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ebx, %ebx +; X86-NEXT: shrdl $1, %ecx, %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edx ; X86-NEXT: pushl %edx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 20(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl %edx, %edx +; X86-NEXT: shrdl $1, %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %edx +; X86-NEXT: pushl $0 +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -872,9 +870,9 @@ ; 
X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax @@ -884,45 +882,45 @@ ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: shrdl $1, %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%ebp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 40(%ebp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: shrdl $1, %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edx ; X86-NEXT: pushl %eax ; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -930,8 +928,8 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 28(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %esi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax @@ -952,14 +950,14 @@ ; X86-NEXT: sbbl $0, %edx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %esi, %esi ; X86-NEXT: sets %bh ; X86-NEXT: xorb %bl, %bh -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi -; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl {{[0-9]+}}(%esp), %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: setne %bl ; X86-NEXT: testb %bh, %bl ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1108,16 +1106,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: sets %al @@ -1145,38 +1143,38 @@ ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %esi -; X86-NEXT: cmovgel %eax, %ecx ; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %eax, %ecx +; X86-NEXT: cmovgel %eax, %esi ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovgel %edx, %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: sbbl %edi, %eax ; X86-NEXT: movl $0, %eax ; X86-NEXT: cmovgel %eax, %ebx -; X86-NEXT: cmovgel %edx, %edi -; X86-NEXT: shldl $31, %ebx, %edi +; X86-NEXT: cmovgel %edx, %esi +; X86-NEXT: shldl $31, %ebx, %esi ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/select-sra.ll b/llvm/test/CodeGen/X86/select-sra.ll --- a/llvm/test/CodeGen/X86/select-sra.ll +++ b/llvm/test/CodeGen/X86/select-sra.ll @@ -18,7 +18,7 @@ ; CHECK-LABEL: isnonneg_i16: ; CHECK: # %bb.0: ; CHECK-NEXT: movswl %di, %eax -; CHECK-NEXT: sarl $15, %eax +; CHECK-NEXT: shrl $15, %eax ; CHECK-NEXT: orl $542, %eax # imm = 0x21E ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -6,16 +6,18 @@ ; SSE2-LABEL: test_eq_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd 
%xmm0, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_eq_1: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax -; SSE41-NEXT: notl %eax +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> @@ -29,14 +31,20 @@ ; SSE2-LABEL: test_ne_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_ne_1: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> @@ -47,10 +55,22 @@ } define i32 @test_le_1(<4 x i32> %A, <4 x i32> %B) { -; CHECK-LABEL: test_le_1: -; CHECK: # %bb.0: -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: retq +; SSE2-LABEL: test_le_1: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_le_1: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sle <4 x i32> %sext, zeroinitializer @@ -63,7 +83,9 @@ ; SSE2-LABEL: test_ge_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq @@ -71,7 +93,9 @@ ; SSE41-LABEL: test_ge_1: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax ; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B @@ -104,10 +128,22 @@ } define i32 @test_gt_1(<4 x i32> %A, <4 x i32> %B) { -; CHECK-LABEL: test_gt_1: -; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: retq +; SSE2-LABEL: test_gt_1: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_gt_1: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sgt <4 x i32> %sext, zeroinitializer @@ -120,16 +156,18 @@ ; SSE2-LABEL: test_eq_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; 
SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_eq_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: notl %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> @@ -143,14 +181,20 @@ ; SSE2-LABEL: test_ne_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_ne_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> @@ -164,15 +208,19 @@ ; SSE2-LABEL: test_le_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_le_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A @@ -184,10 +232,22 @@ } define i32 @test_ge_2(<4 x i32> %A, <4 x i32> %B) { -; CHECK-LABEL: test_ge_2: -; CHECK: # %bb.0: -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: retq +; SSE2-LABEL: test_ge_2: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_ge_2: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sge <4 x i32> zeroinitializer, %sext @@ -221,14 +281,18 @@ ; SSE2-LABEL: test_gt_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_gt_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/X86/setcc-freeze.ll b/llvm/test/CodeGen/X86/setcc-freeze.ll --- a/llvm/test/CodeGen/X86/setcc-freeze.ll +++ b/llvm/test/CodeGen/X86/setcc-freeze.ll @@ -4,7 +4,8 @@ define i32 @f(ptr %p) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: testb $8, 1(%rdi) +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: testl $2048, %eax # imm = 0x800 ; 
CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %B ; CHECK-NEXT: movl $20, %eax diff --git a/llvm/test/CodeGen/X86/setcc-fsh.ll b/llvm/test/CodeGen/X86/setcc-fsh.ll --- a/llvm/test/CodeGen/X86/setcc-fsh.ll +++ b/llvm/test/CodeGen/X86/setcc-fsh.ll @@ -252,9 +252,9 @@ define <4 x i1> @or_rotl_ne_eq0(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: or_rotl_ne_eq0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: retq %rot = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32>%x, <4 x i32> %x, <4 x i32> %y) %or = or <4 x i32> %y, %rot @@ -291,10 +291,10 @@ define <4 x i1> @fshl_or2_eq_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $7, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: retq %or = or <4 x i32> %x, %y %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %or, <4 x i32> ) @@ -305,10 +305,10 @@ define <4 x i1> @fshl_or2_commute_eq_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_commute_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $7, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: retq %or = or <4 x i32> %y, %x %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %or, <4 x i32> ) @@ -319,8 +319,8 @@ define i1 @fshr_or_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orw %di, %si +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldw $8, %di, %si ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %or = or i16 %x, %y @@ -332,8 +332,8 @@ define i1 @fshr_or_commute_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_commute_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orw %di, %si +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldw $8, %di, %si ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %or = or i16 %y, %x @@ -397,10 +397,10 @@ define <4 x i1> @fshl_or2_ne_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $27, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -413,10 +413,10 @@ define <4 x i1> @fshl_or2_commute_ne_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_commute_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $27, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -429,8 +429,8 @@ define i1 @fshr_or_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shlq $63, %rsi -; CHECK-NEXT: orq %rdi, %rsi +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldq $63, %rdi, %rsi ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i64 %x, %y @@ -442,8 +442,8 @@ define i1 @fshr_or_commute_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_commute_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shlq $63, %rsi -; CHECK-NEXT: orq %rdi, %rsi +; 
CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldq $63, %rdi, %rsi ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i64 %y, %x @@ -455,9 +455,8 @@ define i1 @fshr_or2_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl %si, %eax -; CHECK-NEXT: shrl $2, %eax -; CHECK-NEXT: orw %di, %ax +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shrdw $2, %di, %si ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i16 %x, %y @@ -469,9 +468,8 @@ define i1 @fshr_or2_commute_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_commute_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl %si, %eax -; CHECK-NEXT: shrl $2, %eax -; CHECK-NEXT: orw %di, %ax +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shrdw $2, %di, %si ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i16 %y, %x diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll --- a/llvm/test/CodeGen/X86/setcc-logic.ll +++ b/llvm/test/CodeGen/X86/setcc-logic.ll @@ -324,7 +324,11 @@ ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: cmpltpd %xmm0, %xmm1 ; CHECK-NEXT: movmskpd %xmm1, %eax -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: testb $2, %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: sete %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: jne .LBB16_2 ; CHECK-NEXT: # %bb.1: # %true ; CHECK-NEXT: movl $42, %eax @@ -678,18 +682,12 @@ } define i1 @or_cmp_eq_i16(i16 zeroext %x, i16 zeroext %y) { -; NOBMI-LABEL: or_cmp_eq_i16: -; NOBMI: # %bb.0: -; NOBMI-NEXT: notl %edi -; NOBMI-NEXT: testl %esi, %edi -; NOBMI-NEXT: sete %al -; NOBMI-NEXT: retq -; -; BMI-LABEL: or_cmp_eq_i16: -; BMI: # %bb.0: -; BMI-NEXT: andnl %esi, %edi, %eax -; BMI-NEXT: sete %al -; BMI-NEXT: retq +; CHECK-LABEL: or_cmp_eq_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: cmpw %si, %di +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %o = or i16 %x, %y %c = icmp eq i16 %x, %o ret i1 %c @@ -698,8 +696,8 @@ define i1 @or_cmp_ne_i8(i8 zeroext %x, i8 zeroext %y) { ; CHECK-LABEL: or_cmp_ne_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %sil -; CHECK-NEXT: testb %dil, %sil +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: cmpb %dil, %sil ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %o = or i8 %x, %y diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -599,169 +599,287 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; SSE2-LABEL: ne_v4i256: ; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: orq %rbx, %r11 +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: negl %ebp +; SSE2-NEXT: movd %ebp, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movq %r10, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq %rdx, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: xorl %r11d, 
%r11d +; SSE2-NEXT: orq %r10, %rax +; SSE2-NEXT: setne %r11b +; SSE2-NEXT: negl %r11d +; SSE2-NEXT: movd %r11d, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: orq %rcx, %rsi +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: setne %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; SSE2-NEXT: movq %r9, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movq %r8, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movq %rdi, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %r9, %r8 +; SSE2-NEXT: setne %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ne_v4i256: ; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: .cfi_offset %rbx, -16 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE41-NEXT: movq %r10, %xmm0 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %rdx, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE41-NEXT: orq %rcx, %rsi +; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: movd %ecx, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: movq %r9, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: movq %r8, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; SSE41-NEXT: movq %rdi, %xmm3 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: ptest %xmm3, %xmm3 +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %r9, %r8 +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %rbx, %r11 +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rax +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %r10, %rax +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrd $3, %ecx, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; 
SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: sete %al +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 8 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ne_v4i256: ; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .cfi_offset %rbx, -16 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: orq %r10, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: orq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX1-NEXT: orq %rcx, %rsi +; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX1-NEXT: orq %r9, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm1 ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX1-NEXT: orq %r8, %rdi -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %r9, %r8 +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %rbx, %r11 +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %r10, %rax +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 8 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_v4i256: ; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .cfi_offset %rbx, -16 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: orq %r10, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm1 ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: orq %r8, %rdi -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl 
%ecx +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %rbx, %r11 +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vtestps %xmm0, %xmm0 ; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 8 ; AVX2-NEXT: retq ; -; AVX512-LABEL: ne_v4i256: -; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: shrq $32, %rax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: shrq $32, %r10 -; AVX512-NEXT: vpinsrd $3, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: vmovd %r8d, %xmm1 -; AVX512-NEXT: shrq $32, %r8 -; AVX512-NEXT: vpinsrd $1, %r8d, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: vmovd %edx, %xmm1 -; AVX512-NEXT: shrq $32, %rdx -; AVX512-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: shrq $32, %rcx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: shrq $32, %rdi -; AVX512-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX512-NEXT: shrq $32, %rsi -; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_v4i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-NEXT: .cfi_offset %rbx, -24 +; AVX512F-NEXT: .cfi_offset %r14, -16 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: orq %rcx, %rsi +; AVX512F-NEXT: orq %rdx, %rdi +; AVX512F-NEXT: orq %rsi, %rdi +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r8 +; AVX512F-NEXT: orq %r9, %r8 +; AVX512F-NEXT: setne %cl +; AVX512F-NEXT: kmovw %ecx, %k0 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: orq %r14, %rbx +; AVX512F-NEXT: setne %cl +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: orq %r11, %r10 +; AVX512F-NEXT: setne %cl +; AVX512F-NEXT: andl $1, %ecx +; AVX512F-NEXT: kmovw %ecx, %k2 +; 
AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k1 +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k2, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_v4i256: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: .cfi_def_cfa_offset 24 +; AVX512BW-NEXT: .cfi_offset %rbx, -24 +; AVX512BW-NEXT: .cfi_offset %r14, -16 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512BW-NEXT: orq %rcx, %rsi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: orq %rsi, %rdi +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: setne %cl +; AVX512BW-NEXT: kmovd %ecx, %k0 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: setne %cl +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: setne %cl +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: kmovw %ecx, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kshiftlw $2, %k1, %k1 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: kmovw %eax, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: .cfi_def_cfa_offset 8 +; AVX512BW-NEXT: retq %c = icmp ne <4 x i256> %a0, zeroinitializer %b = bitcast <4 x i1> %c to i4 %r = icmp eq i4 %b, 0 @@ -1326,65 +1444,35 @@ } define i1 @eq_i512_op(i512 %a, i512 %b) { -; SSE-LABEL: eq_i512_op: -; SSE: # %bb.0: -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: addq $1, %rdi -; SSE-NEXT: adcq $0, %rsi -; SSE-NEXT: adcq $0, %rdx -; SSE-NEXT: adcq $0, %rcx -; SSE-NEXT: adcq $0, %r8 -; SSE-NEXT: adcq $0, %r9 -; SSE-NEXT: adcq $0, %r10 -; SSE-NEXT: adcq $0, %rax -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; SSE-NEXT: orq %rsi, %r9 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: orq %rdx, %r10 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: orq %r8, %rdi -; SSE-NEXT: orq %r10, %rdi -; SSE-NEXT: orq %rax, %rdi -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVXANY-LABEL: eq_i512_op: -; AVXANY: # %bb.0: -; AVXANY-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: addq $1, %rdi -; AVXANY-NEXT: adcq $0, %rsi -; AVXANY-NEXT: adcq $0, %rdx -; AVXANY-NEXT: adcq $0, %rcx -; AVXANY-NEXT: adcq $0, %r8 -; AVXANY-NEXT: adcq $0, %r9 -; AVXANY-NEXT: adcq $0, %r10 -; AVXANY-NEXT: adcq $0, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; AVXANY-NEXT: orq %rsi, %r9 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: orq %rcx, %rax -; AVXANY-NEXT: orq %r9, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: orq %rdx, %r10 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; AVXANY-NEXT: orq %r8, %rdi -; AVXANY-NEXT: orq %r10, %rdi -; AVXANY-NEXT: orq %rax, %rdi -; AVXANY-NEXT: sete %al -; AVXANY-NEXT: retq +; ANY-LABEL: eq_i512_op: +; ANY: # %bb.0: +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: addq $1, %rdi +; ANY-NEXT: adcq $0, %rsi +; ANY-NEXT: adcq $0, %rdx +; ANY-NEXT: adcq $0, %rcx +; ANY-NEXT: adcq $0, %r8 +; ANY-NEXT: adcq $0, %r9 +; ANY-NEXT: adcq $0, %r10 +; ANY-NEXT: adcq $0, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; ANY-NEXT: orq %rsi, %r9 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: orq %rcx, %rax +; ANY-NEXT: orq %r9, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: orq %rdx, %r10 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; ANY-NEXT: orq %r8, %rdi +; ANY-NEXT: orq %r10, %rdi +; ANY-NEXT: orq %rax, %rdi +; ANY-NEXT: sete %al +; ANY-NEXT: retq %a2 = add i512 %a, 1 %r = icmp eq i512 %a2, %b ret i1 %r diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll --- a/llvm/test/CodeGen/X86/setcc.ll +++ b/llvm/test/CodeGen/X86/setcc.ll @@ -141,7 +141,7 @@ define zeroext i1 @t7(i32 %0) { ; X86-LABEL: t7: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $19, %ecx ; X86-NEXT: btl %eax, %ecx ; X86-NEXT: setb %al diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll --- a/llvm/test/CodeGen/X86/shift-amount-mod.ll +++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll @@ -225,8 +225,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB7_2: -; X32-NEXT: movl %esi, (%eax) ; X32-NEXT: movl %edx, 4(%eax) +; X32-NEXT: movl %esi, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -463,8 +463,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB15_2: -; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -705,8 +705,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: movl %edi, %esi ; X32-NEXT: .LBB23_2: -; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -782,7 +782,7 @@ ; X64-LABEL: reg64_lshr_by_sub_from_negated: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -844,7 +844,7 @@ ; X64-LABEL: 
reg64_lshr_by_sub_of_negated: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1033,7 +1033,7 @@ ; X64-LABEL: reg64_lshr_by_add_of_negated_amts: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -1157,7 +1157,7 @@ ; X64-LABEL: reg64_lshr_by_negated_unfolded_sub_b: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -1218,7 +1218,7 @@ ; X64-LABEL: reg64_lshr_by_b_sub_negated_unfolded: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1556,9 +1556,8 @@ define i16 @sh_trunc_sh(i64 %x) { ; X32-LABEL: sh_trunc_sh: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrl $4, %eax -; X32-NEXT: andl $15, %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: shrl $12, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -144,8 +144,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB5_2: -; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/shift-by-signext.ll b/llvm/test/CodeGen/X86/shift-by-signext.ll --- a/llvm/test/CodeGen/X86/shift-by-signext.ll +++ b/llvm/test/CodeGen/X86/shift-by-signext.ll @@ -88,9 +88,9 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind { ; X86-LABEL: n6_fshl: ; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: retl ; @@ -108,9 +108,9 @@ define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind { ; X86-LABEL: n7_fshr: ; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -536,11 +536,11 @@ define i32 @xor_tree_with_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %d) { ; X32-LABEL: xor_tree_with_shifts_i32: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrl $16, %eax -; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X64-LABEL: xor_tree_with_shifts_i32: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll --- 
a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -14,7 +14,7 @@ ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -85,7 +85,7 @@ ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -158,7 +158,7 @@ ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -551,7 +551,7 @@ ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $100, %esp +; i686-NEXT: subl $92, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -569,93 +569,95 @@ ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebp, %ecx -; i686-NEXT: shrl $3, %ebp -; i686-NEXT: andl $15, %ebp -; i686-NEXT: leal {{[0-9]+}}(%esp), %eax -; i686-NEXT: subl %ebp, %eax +; i686-NEXT: movl %ebp, %ebx +; i686-NEXT: movl %ebp, %eax +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax +; i686-NEXT: leal {{[0-9]+}}(%esp), %ebp +; i686-NEXT: subl %eax, %ebp ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl 8(%eax), %edx +; i686-NEXT: movl 8(%ebp), %edx ; i686-NEXT: movl %edx, (%esp) # 4-byte Spill -; i686-NEXT: andl $7, %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: andl $7, %ebx +; i686-NEXT: movl %ebx, %ecx ; i686-NEXT: shll %cl, %edx -; i686-NEXT: movl 4(%eax), %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl %esi +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl %ebx, %edi ; i686-NEXT: notl %ecx +; i686-NEXT: negl %eax +; i686-NEXT: movl 48(%esp,%eax), %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrl %eax +; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %edx, %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl (%eax), %eax +; i686-NEXT: shrl %cl, %eax +; i686-NEXT: orl %edx, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %edx -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $15, %edx -; i686-NEXT: leal {{[0-9]+}}(%esp), %esi -; i686-NEXT: subl %edx, %esi +; i686-NEXT: movl %esi, %edx +; i686-NEXT: movl %esi, %eax +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax +; i686-NEXT: leal {{[0-9]+}}(%esp), %ebx +; i686-NEXT: subl %eax, %ebx +; i686-NEXT: movl (%ebp), %ecx +; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; i686-NEXT: andl $7, %ebx -; i686-NEXT: movl 8(%esi), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl 4(%esi), %eax +; i686-NEXT: andl $7, %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 8(%ebx), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %edx, %ecx +; i686-NEXT: shll %cl, %esi +; i686-NEXT: notl %edx +; i686-NEXT: negl %eax +; i686-NEXT: movl 80(%esp,%eax), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: shrl %eax -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: notl %ecx -; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: movl %edx, %ecx ; i686-NEXT: shrl %cl, %eax -; i686-NEXT: orl %edi, %eax -; i686-NEXT: movl (%esi), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ecx, %edi -; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: negl %ebp -; i686-NEXT: movl 64(%esp,%ebp), %esi -; i686-NEXT: movl %edi, %ecx -; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: orl %esi, %eax +; i686-NEXT: movl 12(%ebp), %esi +; i686-NEXT: movl %edi, %edx +; i686-NEXT: movl %edx, %ecx ; i686-NEXT: movl (%esp), %edi # 4-byte Reload ; i686-NEXT: shldl %cl, %edi, %esi ; i686-NEXT: movl %esi, (%esp) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi +; i686-NEXT: movl (%ebx), %edi +; i686-NEXT: movl 12(%ebx), %ebp +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: shldl %cl, %esi, %ebp -; i686-NEXT: negl %edx -; i686-NEXT: movl 96(%esp,%edx), %edx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shldl %cl, %ebx, %edx ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %edx, 28(%ecx) -; i686-NEXT: movl %ebp, 20(%ecx) -; i686-NEXT: movl %edi, 16(%ecx) -; i686-NEXT: movl (%esp), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 12(%ecx) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 4(%ecx) +; i686-NEXT: movl %ebp, 28(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; i686-NEXT: movl %ebp, %esi +; i686-NEXT: movl %edx, %ecx +; i686-NEXT: shll %cl, %esi ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %eax, 24(%ecx) +; i686-NEXT: shldl %cl, %ebp, %edx +; i686-NEXT: movl %edi, %ebp +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: shll %cl, %ebp +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; i686-NEXT: shldl %cl, %edi, %ebx +; i686-NEXT: movl {{[0-9]+}}(%esp), %edi +; i686-NEXT: movl %ebx, 20(%edi) +; i686-NEXT: movl %ebp, 16(%edi) +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 12(%edi) +; i686-NEXT: movl %edx, 4(%edi) +; i686-NEXT: movl %esi, (%edi) +; i686-NEXT: movl %eax, 24(%edi) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; i686-NEXT: movl %eax, 8(%ecx) -; i686-NEXT: addl $100, %esp +; i686-NEXT: movl %eax, 8(%edi) +; i686-NEXT: addl $92, %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $92, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -204,7 +204,7 @@ ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $92, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -129,7 +129,7 @@ ; ; X64-SHIFT-LABEL: test_i16_shl_lshr_1: ; X64-SHIFT: # %bb.0: -; X64-SHIFT-NEXT: movzwl %di, %eax +; X64-SHIFT-NEXT: movl %edi, %eax ; X64-SHIFT-NEXT: shrl $3, %eax ; X64-SHIFT-NEXT: shll $5, %eax ; X64-SHIFT-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/shift-parts.ll b/llvm/test/CodeGen/X86/shift-parts.ll --- a/llvm/test/CodeGen/X86/shift-parts.ll +++ b/llvm/test/CodeGen/X86/shift-parts.ll @@ -11,15 +11,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq g_144+24(%rip), %rax ; CHECK-NEXT: movq g_144+16(%rip), %rcx -; CHECK-NEXT: movzbl %sil, %edx -; CHECK-NEXT: shll $6, %edx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %for.cond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb $64, %dl -; CHECK-NEXT: movq %rcx, %rsi -; CHECK-NEXT: cmovneq %rax, %rsi -; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: cmovneq %rax, %rdx +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/shl_elim.ll b/llvm/test/CodeGen/X86/shl_elim.ll --- a/llvm/test/CodeGen/X86/shl_elim.ll +++ b/llvm/test/CodeGen/X86/shl_elim.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: test1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: shrl %eax +; CHECK-NEXT: shrl $9, %eax ; CHECK-NEXT: cwtl ; CHECK-NEXT: retl %tmp29 = lshr i64 %a, 24 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll --- a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll +++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll @@ -98,10 +98,12 @@ define dso_local void @test5(i32 %X) nounwind !prof !14 { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzbl x+6(%rip), %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: movzwl x+4(%rip), %ecx -; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movzwl x+4(%rip), %eax +; CHECK-NEXT: shlq $32, %rax +; CHECK-NEXT: movzbl x+6(%rip), %ecx +; CHECK-NEXT: shlq $48, %rcx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: shrq $32, %rcx ; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: jne bar # TAILCALL ; CHECK-NEXT: # %bb.1: # %if.end diff --git a/llvm/test/CodeGen/X86/shrink-compare.ll b/llvm/test/CodeGen/X86/shrink-compare.ll --- a/llvm/test/CodeGen/X86/shrink-compare.ll +++ 
b/llvm/test/CodeGen/X86/shrink-compare.ll @@ -98,10 +98,12 @@ define dso_local void @test5(i32 %X) nounwind minsize { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzbl x+6(%rip), %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: movzwl x+4(%rip), %ecx -; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: movzwl x+4(%rip), %eax +; CHECK-NEXT: shlq $32, %rax +; CHECK-NEXT: movzbl x+6(%rip), %ecx +; CHECK-NEXT: shlq $48, %rcx +; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: shrq $32, %rcx ; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: jne bar # TAILCALL ; CHECK-NEXT: # %bb.1: # %if.end diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -719,18 +719,18 @@ ; X86-SSE-NEXT: pmulhuw %xmm2, %xmm4 ; X86-SSE-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; X86-SSE-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE-NEXT: pmulhuw %xmm3, %xmm4 ; X86-SSE-NEXT: pmullw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -790,18 +790,18 @@ ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: 
punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16: @@ -1216,18 +1216,18 @@ ; X86-SSE-NEXT: pmulhw %xmm2, %xmm4 ; X86-SSE-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; X86-SSE-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE-NEXT: pmulhw %xmm3, %xmm4 ; X86-SSE-NEXT: pmullw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1287,18 +1287,18 @@ ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16_sext: @@ -1427,7 +1427,7 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; 
X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1451,7 +1451,7 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) @@ -1619,7 +1619,7 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1643,7 +1643,7 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) @@ -1683,7 +1683,7 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1707,7 +1707,7 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) @@ -1750,6 +1750,8 @@ ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: psllq $32, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1773,6 +1775,8 @@ ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: psllq $32, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -1983,86 +1987,86 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movzwl 16(%eax), %edx ; X86-SSE-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE-NEXT: movdqa (%eax), %xmm3 +; X86-SSE-NEXT: movdqa (%eax), %xmm2 ; X86-SSE-NEXT: movdqa (%ecx), %xmm0 ; X86-SSE-NEXT: movdqa 16(%ecx), %xmm1 -; 
X86-SSE-NEXT: pxor %xmm5, %xmm5 -; X86-SSE-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE-NEXT: pextrw $7, %xmm3, %eax -; X86-SSE-NEXT: pextrw $4, %xmm3, %edi -; X86-SSE-NEXT: pextrw $0, %xmm3, %ebp -; X86-SSE-NEXT: pextrw $1, %xmm3, %esi -; X86-SSE-NEXT: pextrw $3, %xmm3, %ebx -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X86-SSE-NEXT: movd %xmm3, %ecx +; X86-SSE-NEXT: pxor %xmm4, %xmm4 +; X86-SSE-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE-NEXT: pextrw $7, %xmm2, %eax +; X86-SSE-NEXT: pextrw $4, %xmm2, %edi +; X86-SSE-NEXT: pextrw $0, %xmm2, %ebp +; X86-SSE-NEXT: pextrw $1, %xmm2, %esi +; X86-SSE-NEXT: pextrw $3, %xmm2, %ebx +; X86-SSE-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X86-SSE-NEXT: movd %xmm2, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; X86-SSE-NEXT: movd %xmm5, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; X86-SSE-NEXT: movd %xmm5, %ecx +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X86-SSE-NEXT: movd %xmm4, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm5 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X86-SSE-NEXT: movd %edx, %xmm6 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-SSE-NEXT: divl 16(%edi) -; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %eax +; X86-SSE-NEXT: movd %edx, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; X86-SSE-NEXT: movd %xmm3, %eax ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X86-SSE-NEXT: movd %xmm1, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; X86-SSE-NEXT: movl %ebp, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl (%edi) -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; X86-SSE-NEXT: movd %xmm6, %ecx ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: movd %edx, %xmm6 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; X86-SSE-NEXT: 
pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; X86-SSE-NEXT: movd %xmm7, %ecx ; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %edx, %xmm7 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE-NEXT: movd %xmm5, %eax ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X86-SSE-NEXT: movd %xmm0, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 32(%edi) -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm5 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pmuludq %xmm5, %xmm6 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm5, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; X86-SSE-NEXT: pmuludq %xmm5, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE-NEXT: pmuludq %xmm5, %xmm1 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 @@ -2204,91 +2208,91 @@ ; X64-SSE-LABEL: PR34947: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movzwl 16(%rdi), %ecx -; X64-SSE-NEXT: movdqa (%rdi), %xmm3 +; X64-SSE-NEXT: movdqa (%rdi), %xmm2 ; X64-SSE-NEXT: movdqa (%rsi), %xmm0 ; X64-SSE-NEXT: movdqa 16(%rsi), %xmm1 -; X64-SSE-NEXT: pxor %xmm5, %xmm5 -; X64-SSE-NEXT: movdqa %xmm3, %xmm2 -; X64-SSE-NEXT: pextrw $7, %xmm3, %eax -; X64-SSE-NEXT: pextrw $4, %xmm3, %r8d -; X64-SSE-NEXT: pextrw $0, %xmm3, %r10d -; X64-SSE-NEXT: pextrw $1, %xmm3, %edi -; X64-SSE-NEXT: pextrw $3, %xmm3, %r9d -; X64-SSE-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; X64-SSE-NEXT: movd %xmm3, %r11d +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE-NEXT: pextrw $7, %xmm2, %eax +; X64-SSE-NEXT: pextrw $4, %xmm2, %r8d +; X64-SSE-NEXT: pextrw $0, %xmm2, %r10d +; X64-SSE-NEXT: pextrw $1, %xmm2, %edi +; X64-SSE-NEXT: pextrw $3, %xmm2, %r9d +; X64-SSE-NEXT: movdqa %xmm2, %xmm5 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; X64-SSE-NEXT: movd %xmm2, %r11d ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; X64-SSE-NEXT: movd %xmm5, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; X64-SSE-NEXT: movd %xmm5, %r11d +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; X64-SSE-NEXT: movd %xmm4, %r11d ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm5 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; X64-SSE-NEXT: movd %edx, %xmm6 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; X64-SSE-NEXT: movl %r8d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %eax +; X64-SSE-NEXT: movd %edx, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; X64-SSE-NEXT: movd %xmm3, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; X64-SSE-NEXT: movd %xmm1, %r8d ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r8d ; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) -; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %r8d +; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; X64-SSE-NEXT: movd %xmm6, %r8d ; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r8d -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-SSE-NEXT: movd %xmm2, %edi +; X64-SSE-NEXT: movd %edx, %xmm6 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; X64-SSE-NEXT: movd %xmm7, %edi ; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: movd %edx, %xmm7 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X64-SSE-NEXT: movd %xmm5, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-SSE-NEXT: movd %xmm0, %edi ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; X64-SSE-NEXT: movl %ecx, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = 
xmm6[0],xmm7[0] ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm6 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-SSE-NEXT: movl %eax, (%rax) +; X64-SSE-NEXT: movdqa %xmm4, (%rax) ; X64-SSE-NEXT: movdqa %xmm3, (%rax) -; X64-SSE-NEXT: movdqa %xmm1, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: PR34947: diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll --- a/llvm/test/CodeGen/X86/shuffle-half.ll +++ b/llvm/test/CodeGen/X86/shuffle-half.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %cond.load ; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; CHECK-NEXT: .LBB0_2: # %else diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -156,9 +156,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, (%rsi) ; SSE2-NEXT: retq @@ -463,8 +463,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -541,8 +541,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_4: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; 
SSE2-NEXT: movw %ax, (%rsi) @@ -619,8 +619,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_6: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -64,10 +64,10 @@ ; ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-FAST-ALL-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BWVL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vpermi2ps 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-FAST-ALL-NEXT: vmovaps %ymm1, (%rsi) ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper ; AVX512BWVL-FAST-ALL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -276,9 +276,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: @@ -290,16 +290,16 @@ ; ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512VBMIVL-NEXT: vmovaps %xmm0, (%rsi) ; AVX512VBMIVL-NEXT: retq %vec = load <8 x i32>, ptr %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1325,74 +1325,74 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX1-LABEL: negative: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: 
vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: negative: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: negative: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: negative: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: negative: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovd %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; 
AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: negative: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001 -; AVX512BWVL-NEXT: kmovd %eax, %k1 -; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm1, %eax +; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: negative: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 -; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMIVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovd %xmm1, %eax +; AVX512VBMIVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -166,12 +166,32 @@ } define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind { -; AVX512-LABEL: shuffle_v16i32_to_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i32_to_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps (%rdi), %ymm0 +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512VL-FAST-ALL-NEXT: vpermi2ps 32(%rdi), %ymm0, %ymm1 +; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm1, (%rsi) +; AVX512VL-FAST-ALL-NEXT: vzeroupper +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VL-FAST-PERLANE-NEXT: vzeroupper +; AVX512VL-FAST-PERLANE-NEXT: retq %vec = load <16 x i32>, ptr %L %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> store <8 x i32> %strided.vec, ptr %S diff --git a/llvm/test/CodeGen/X86/signbit-shift.ll b/llvm/test/CodeGen/X86/signbit-shift.ll --- a/llvm/test/CodeGen/X86/signbit-shift.ll +++ b/llvm/test/CodeGen/X86/signbit-shift.ll @@ -86,8 +86,9 @@ ; CHECK-LABEL: add_sext_ifpos: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: shrl $31, %edi -; CHECK-NEXT: leal 41(%rdi), %eax +; CHECK-NEXT: notl %edi +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: leal 42(%rdi), %eax ; CHECK-NEXT: 
retq %c = icmp sgt i32 %x, -1 %e = sext i1 %c to i32 diff --git a/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll b/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll --- a/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll +++ b/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll @@ -87,8 +87,8 @@ ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: load_single_256bit_elt_vector: @@ -129,8 +129,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: store_single_256bit_elt_vector: @@ -186,8 +186,8 @@ ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -198,8 +198,8 @@ ; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -226,17 +226,17 @@ ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: store_single_512bit_elt_vector: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %ymm0 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -244,8 +244,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -435,8 +435,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -822,7 +822,7 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %r9, %r15 ; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 @@ -830,42 +830,42 @@ ; X64-NEXT: negq %r11 ; X64-NEXT: andl $1, %r9d ; X64-NEXT: negq %r9 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: addq %rdx, %rdi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %rax, %rbp +; X64-NEXT: addq %rax, %rdi ; X64-NEXT: adcq %rdx, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movzbl %sil, %edi +; X64-NEXT: movzbl %sil, %r14d ; X64-NEXT: addq %rax, %rcx -; X64-NEXT: adcq %rdx, %rdi +; X64-NEXT: adcq %rdx, %r14 ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r10, %r13 -; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: adcq %r13, %rsi ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: addq %rsi, %rax @@ -873,63 +873,64 @@ ; X64-NEXT: adcq %rdx, %rbx ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: adcq %rdi, %rbx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: addq %r13, %r14 +; X64-NEXT: addq %r13, %r15 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %r15, %r14 +; X64-NEXT: addq %r8, %r15 ; X64-NEXT: adcq %r13, %rbp ; X64-NEXT: setb %al -; X64-NEXT: addq %r8, %rbp +; X64-NEXT: addq %rdi, %rbp ; X64-NEXT: movzbl %al, %r12d ; X64-NEXT: adcq %rdx, %r12 -; X64-NEXT: addq %r15, %rsi +; X64-NEXT: addq %r8, %rsi +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: adcq %rbx, %r15 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq %rdi, %r12 +; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: setb %cl ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: adcq %rdx, %rdi -; X64-NEXT: setb %bl -; X64-NEXT: addq %rax, %rdi -; X64-NEXT: movzbl %bl, %esi -; X64-NEXT: adcq %rdx, %rsi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: addq %rax, %r8 +; X64-NEXT: adcq %rdx, %rbx +; X64-NEXT: setb %r14b +; X64-NEXT: addq %rax, %rbx 
+; X64-NEXT: movzbl %r14b, %r14d +; X64-NEXT: adcq %rdx, %r14 ; X64-NEXT: addq %rax, %rbp -; X64-NEXT: adcq %r12, %r10 +; X64-NEXT: adcq %r12, %r8 ; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq %rsi, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: addq %rax, %r8 +; X64-NEXT: addq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: addq %rbx, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: addq %r12, %rdi ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: setb %al ; X64-NEXT: addq %rsi, %rcx @@ -937,42 +938,43 @@ ; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: movq %r9, %rax ; X64-NEXT: imulq %r11 -; X64-NEXT: movq %rbx, %r11 +; X64-NEXT: movq %r12, %r11 ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: movq %r8, %r12 +; X64-NEXT: movq %rdi, %r12 ; X64-NEXT: adcq %rdx, %r12 ; X64-NEXT: addq %rcx, %r11 ; X64-NEXT: adcq %rsi, %r12 -; X64-NEXT: movq %r15, %r9 +; X64-NEXT: movq %r10, %r9 ; X64-NEXT: addq %r13, %r9 ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: adcq %rsi, %r13 -; X64-NEXT: setb %bl +; X64-NEXT: setb %r10b ; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %bl, %ecx +; X64-NEXT: movzbl %r10b, %ecx ; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: addq %r15, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: addq %rsi, %rax ; X64-NEXT: adcq %r9, %rdx ; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; X64-NEXT: adcq %r8, %r9 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq %rdi, %r9 ; X64-NEXT: adcq %r11, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq %rbp, %r15 -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: adcq %rdi, %rax -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq %r14, %rcx +; X64-NEXT: addq %rbp, %rsi +; X64-NEXT: adcq %r8, %r9 +; X64-NEXT: adcq %rbx, %rax +; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: movq %r15, %rcx ; X64-NEXT: sarq $63, %rcx ; X64-NEXT: xorq %rcx, %rdx ; X64-NEXT: xorq %rcx, %r9 ; X64-NEXT: orq %rdx, %r9 ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %r15, %rcx +; X64-NEXT: xorq %rsi, %rcx ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: orq %r9, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload @@ -980,9 +982,9 @@ ; X64-NEXT: andl $1, %esi ; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: negq %rdx -; X64-NEXT: xorq %rdx, %r14 +; X64-NEXT: xorq %rdx, %r15 ; X64-NEXT: xorq %rax, %rdx -; X64-NEXT: orq %r14, %rdx +; X64-NEXT: orq %r15, %rdx ; X64-NEXT: orq %rcx, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -387,8 +387,9 @@ define <8 x i64> @pr23259() #1 { ; AVX-LABEL: pr23259: ; AVX: # %bb.0: # %entry -; AVX-NEXT: 
vmovaps A+16(%rip), %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] +; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vpinsrq $0, A+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -185,7 +185,7 @@ ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: imulq %r8 ; SSE2-NEXT: movq %rdx, %rax @@ -193,10 +193,10 @@ ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rdi -; SSE2-NEXT: movq %rdi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movq %rdi, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rdx @@ -208,19 +208,21 @@ ; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3] -; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: notl %ecx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: # kill: def $cl killed $cl killed $ecx ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_srem_vec: @@ -271,7 +273,7 @@ ; SSE41-NEXT: pxor %xmm1, %xmm2 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: pextrb $8, %xmm0, %edx -; SSE41-NEXT: pextrb $0, %xmm2, %ecx +; SSE41-NEXT: movd %xmm2, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -375,12 +377,11 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax -; AVX2-NEXT: vpextrb $8, %xmm1, %edx -; AVX2-NEXT: vpextrb $0, %xmm2, %ecx +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vpextrb $8, %xmm0, %edx +; AVX2-NEXT: vpextrb $0, %xmm1, %ecx ; AVX2-NEXT: # 
kill: def $al killed $al killed $eax ; AVX2-NEXT: # kill: def $dl killed $dl killed $edx ; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll --- a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll @@ -47,11 +47,18 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; X86-LABEL: test_optsize: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %eax # imm = 0x19999999 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl $42, %eax -; X86-NEXT: jb .LBB1_2 +; X86-NEXT: je .LBB1_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl $-10, %eax ; X86-NEXT: .LBB1_2: @@ -59,12 +66,17 @@ ; ; X64-LABEL: test_optsize: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %eax # imm = 0x19999999 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq $1717986919, %rax, %rcx # imm = 0x66666667 +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rcx +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: movl $42, %ecx ; X64-NEXT: movl $-10, %eax -; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %ecx, %eax ; X64-NEXT: retq %rem = srem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -9,76 +9,131 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; 
CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,171798690,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE41-NEXT: psrad $1, %xmm5 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, 
%xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm5 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -94,55 +149,125 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -153,56 +278,126 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, 
%xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -215,70 +410,131 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_eq: ; 
CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, 
%xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -289,71 +545,132 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; 
CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; 
CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -366,77 +683,145 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = 
xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: 
pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; 
CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -447,78 +832,146 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; 
CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -533,70 +986,123 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: 
movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = <1717986919,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} 
xmm4 = xmm4[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -610,83 +1116,107 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,268435456,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: ; CHECK-SSE41: 
# %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -700,82 +1230,131 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = 
xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; 
CHECK-SSE41-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE41-NEXT: psrad $1, %xmm5 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm5 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; 
CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -791,54 +1370,123 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: 
shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1717986919,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd 
{{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -852,70 +1500,121 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; 
CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: 
vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, 
%xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -928,76 +1627,139 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; 
CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; 
CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1014,77 +1776,123 @@ ; CHECK-SSE2-LABEL: test_srem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483647,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $30, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = 
<1717986919,u,2147483647,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-SSE41-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,0,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, 
%xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1097,114 +1905,129 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $30, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, 
%xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: por %xmm5, %xmm4 -; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; 
CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm4 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1217,110 +2040,144 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[3435973837,3067833783,1,3264175145] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $30, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3067833783,1,3264175145] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: por %xmm4, %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672] -; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, 
%xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1335,71 +2192,140 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,2147483649,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1717986919,1717986919] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: 
psrad $3, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: psrad $3, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1412,83 +2338,129 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] +; CHECK-SSE2-NEXT: shufps 
{{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 
= xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1501,83 +2473,144 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; 
CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1592,55 +2625,126 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1717986919,1717986919] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, 
%xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1653,70 +2757,128 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; 
CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1729,77 +2891,137 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd 
%xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1814,76 +3036,134 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $3, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; 
CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1897,83 +3177,119 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; 
CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; 
CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1986,76 +3302,139 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; 
CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: 
vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2070,68 +3449,122 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,2147483649,0] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm6 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrlq $32, %xmm3 +; CHECK-SSE2-NEXT: psubd %xmm6, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrlq $32, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; 
CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -2143,68 +3576,112 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,0] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: psrlq $32, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrad $3, %xmm5 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrlq $32, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -2218,123 +3695,190 @@ ; CHECK-SSE2-LABEL: pr51133: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movq %rdi, %rax -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm5 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: psrlw $8, %xmm5 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm6 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: 
pxor %xmm8, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm6, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm6 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; CHECK-SSE2-NEXT: psraw $8, %xmm6 ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE2-NEXT: psrlw $8, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] -; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5 -; CHECK-SSE2-NEXT: por %xmm7, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: psrlw $8, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrlw $8, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] -; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx -; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx -; CHECK-SSE2-NEXT: shll $16, %edx +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; CHECK-SSE2-NEXT: psraw $8, %xmm8 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: packuswb %xmm6, 
%xmm8 +; CHECK-SSE2-NEXT: psrlw $7, %xmm7 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-SSE2-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: psubb %xmm7, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; CHECK-SSE2-NEXT: psraw $8, %xmm8 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm9 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; CHECK-SSE2-NEXT: psraw $8, %xmm9 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; CHECK-SSE2-NEXT: psrlw $8, %xmm9 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm9 +; CHECK-SSE2-NEXT: psrlw $7, %xmm7 +; CHECK-SSE2-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm9, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm6 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm7 +; CHECK-SSE2-NEXT: psubb %xmm7, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm2 +; CHECK-SSE2-NEXT: por %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm3 +; CHECK-SSE2-NEXT: por %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pmovmskb %xmm3, %ecx +; CHECK-SSE2-NEXT: notl %ecx +; CHECK-SSE2-NEXT: shll $16, %ecx +; CHECK-SSE2-NEXT: pmovmskb %xmm2, %edx +; CHECK-SSE2-NEXT: xorl $65535, %edx # imm = 0xFFFF ; CHECK-SSE2-NEXT: orl %ecx, %edx ; CHECK-SSE2-NEXT: movl %edx, (%rdi) ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: 
pr51133: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 ; CHECK-SSE41-NEXT: movq %rdi, %rax -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE41-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE41-NEXT: psrlw $8, %xmm5 +; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE41-NEXT: psrlw $8, %xmm6 +; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm6 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255] +; CHECK-SSE41-NEXT: pand %xmm0, %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm6, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm5 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; CHECK-SSE41-NEXT: psraw $8, %xmm5 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE41-NEXT: psrlw $8, %xmm5 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; CHECK-SSE41-NEXT: psraw $8, %xmm8 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm8 +; CHECK-SSE41-NEXT: psrlw $7, %xmm7 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-SSE41-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 -; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE41-NEXT: psubb %xmm8, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: psrlw $8, %xmm7 +; CHECK-SSE41-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; CHECK-SSE41-NEXT: pmulhw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm9 +; CHECK-SSE41-NEXT: pshufb {{.*#+}} xmm9 = zero,zero,xmm9[9],zero,zero,zero,xmm9[11],zero,xmm9[12],zero,xmm9[13],zero,zero,zero,xmm9[15],zero +; CHECK-SSE41-NEXT: packuswb %xmm9, %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; CHECK-SSE41-NEXT: psraw $8, %xmm8 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm9 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; CHECK-SSE41-NEXT: psraw $8, %xmm9 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; CHECK-SSE41-NEXT: psrlw $8, %xmm9 +; CHECK-SSE41-NEXT: packuswb %xmm8, %xmm9 +; CHECK-SSE41-NEXT: psrlw $7, %xmm7 +; CHECK-SSE41-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm9, %xmm7 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE41-NEXT: pand %xmm5, %xmm6 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 -; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 -; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE41-NEXT: psrlw $8, %xmm6 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] -; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1 -; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE41-NEXT: pmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 -; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psrlw $8, %xmm4 -; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 -; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] -; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE41-NEXT: pandn %xmm4, %xmm2 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm6 +; CHECK-SSE41-NEXT: psubb %xmm6, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; CHECK-SSE41-NEXT: por %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; CHECK-SSE41-NEXT: por %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx +; CHECK-SSE41-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx +; CHECK-SSE41-NEXT: notl %edx ; CHECK-SSE41-NEXT: shll $16, %edx ; CHECK-SSE41-NEXT: orl %ecx, %edx ; CHECK-SSE41-NEXT: movl %edx, (%rdi) @@ -2360,57 +3904,55 @@ ; CHECK-AVX1-NEXT: vpsraw $8, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 -; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 ; CHECK-AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; CHECK-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm5 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6 +; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 ; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 -; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5 -; CHECK-AVX1-NEXT: vpsubb %xmm5, %xmm0, %xmm5 +; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpackuswb %xmm5, 
%xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm4 ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 -; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero -; CHECK-AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5 +; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero +; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-AVX1-NEXT: vpsraw $8, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-AVX1-NEXT: vpsraw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-AVX1-NEXT: vpsraw $8, %xmm8, %xmm8 -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8 -; CHECK-AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 -; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm8, %xmm7 -; CHECK-AVX1-NEXT: vpsrlw $7, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpand %xmm4, %xmm6, %xmm4 -; CHECK-AVX1-NEXT: vpaddb %xmm4, %xmm7, %xmm4 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; 
CHECK-AVX1-NEXT: vpsrlw $7, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 ; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm5, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm3 ; CHECK-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 @@ -2469,32 +4011,46 @@ ; ; CHECK-AVX512VL-LABEL: pr51133: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; CHECK-AVX512VL-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; CHECK-AVX512VL-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 ; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; CHECK-AVX512VL-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 -; CHECK-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero +; CHECK-AVX512VL-NEXT: vpackuswb %ymm6, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpaddb %ymm4, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512VL-NEXT: vpsraw $8, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-AVX512VL-NEXT: vpsraw $8, %ymm6, %ymm6 +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm6, %ymm6 +; CHECK-AVX512VL-NEXT: vpackuswb %ymm4, %ymm6, %ymm4 +; CHECK-AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; CHECK-AVX512VL-NEXT: vpternlogd $128, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm6 +; CHECK-AVX512VL-NEXT: vpaddb %ymm6, %ymm4, %ymm3 +; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0 -; CHECK-AVX512VL-NEXT: vpternlogq $14, %ymm3, %ymm2, %ymm0 +; CHECK-AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 +; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; CHECK-AVX512VL-NEXT: vpternlogq $30, %ymm3, %ymm1, %ymm0 ; CHECK-AVX512VL-NEXT: retq %rem = srem <32 x i8> %x, %cmp = icmp ne <32 x i8> %rem, zeroinitializer diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ 
b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -9,55 +9,105 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_25: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; 
CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_25: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -71,70 +121,105 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; 
CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -150,55 +235,102 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_neg25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_neg25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; 
CHECK-AVX1-LABEL: test_srem_odd_neg25: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_neg25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_neg25: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -212,70 +344,106 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_neg100: ; 
CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; 
CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_neg100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd 
{{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq.ll b/llvm/test/CodeGen/X86/srem-seteq.ll --- a/llvm/test/CodeGen/X86/srem-seteq.ll +++ b/llvm/test/CodeGen/X86/srem-seteq.ll @@ -9,20 +9,32 @@ define i32 @test_srem_odd(i32 %X) nounwind { ; X86-LABEL: test_srem_odd: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 5 %cmp = icmp eq i32 %srem, 0 @@ -33,20 +45,34 @@ define i32 @test_srem_odd_25(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_25: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: addl $85899345, %ecx # imm = 0x51EB851 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $3, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $171798691, %ecx # imm = 0xA3D70A3 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_25: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: addl $85899345, %ecx # imm = 0x51EB851 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $35, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $171798691, %ecx # imm = 0xA3D70A3 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 25 %cmp = icmp eq i32 %srem, 0 @@ -58,20 +84,34 @@ define i32 @test_srem_odd_bit30(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_bit30: ; X86: # %bb.0: -; 
X86-NEXT: imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB -; X86-NEXT: incl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $536870911, %edx # imm = 0x1FFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $27, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $1073741827, %edx, %edx # imm = 0x40000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $1789569707, %edi, %ecx # imm = 0x6AAAAAAB -; X64-NEXT: incl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shlq $29, %rax +; X64-NEXT: subq %rcx, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $59, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $1073741827, %eax, %edx # imm = 0x40000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 1073741827 %cmp = icmp eq i32 %srem, 0 @@ -83,20 +123,35 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $-715827883, {{[0-9]+}}(%esp), %ecx # imm = 0xD5555555 -; X86-NEXT: incl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-536870913, %edx # imm = 0xDFFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $28, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-2147483645, %edx, %edx # imm = 0x80000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $-715827883, %edi, %ecx # imm = 0xD5555555 -; X64-NEXT: incl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shlq $29, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $60, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-2147483645, %eax, %edx # imm = 0x80000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 2147483651 %cmp = icmp eq i32 %srem, 0 @@ -111,25 +166,35 @@ define i16 @test_srem_even(i16 %X) nounwind { ; X86-LABEL: test_srem_even: ; X86: # %bb.0: -; X86-NEXT: imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7 -; X86-NEXT: addl $4680, %eax # imm = 0x1248 -; X86-NEXT: rorw %ax -; X86-NEXT: movzwl %ax, %ecx +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull $18725, %ecx, %edx # imm = 0x4925 +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $18, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx), %eax +; X86-NEXT: shll $4, %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4681, %ecx # imm = 0x1249 -; X86-NEXT: setae %al +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even: ; X64: # %bb.0: -; X64-NEXT: imull $28087, %edi, %eax # imm = 0x6DB7 -; X64-NEXT: addl $4680, %eax # imm = 0x1248 -; X64-NEXT: rorw %ax -; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: movswl 
%di, %ecx +; X64-NEXT: imull $18725, %ecx, %edx # imm = 0x4925 +; X64-NEXT: movl %edx, %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: sarl $18, %edx +; X64-NEXT: addl %eax, %edx +; X64-NEXT: leal (%rdx,%rdx), %eax +; X64-NEXT: shll $4, %edx +; X64-NEXT: subl %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4681, %ecx # imm = 0x1249 -; X64-NEXT: setae %al +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: setne %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %srem = srem i16 %X, 14 @@ -141,22 +206,32 @@ define i32 @test_srem_even_100(i32 %X) nounwind { ; X86-LABEL: test_srem_even_100: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: addl $85899344, %ecx # imm = 0x51EB850 -; X86-NEXT: rorl $2, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $5, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $100, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even_100: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: addl $85899344, %ecx # imm = 0x51EB850 -; X64-NEXT: rorl $2, %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $37, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $100, %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 100 %cmp = icmp eq i32 %srem, 0 @@ -168,22 +243,32 @@ define i32 @test_srem_even_bit30(i32 %X) nounwind { ; X86-LABEL: test_srem_even_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5 -; X86-NEXT: addl $8, %ecx -; X86-NEXT: rorl $3, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1073741721, %edx # imm = 0x3FFFFF99 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $28, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $1073741928, %edx, %edx # imm = 0x40000068 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $-51622203, %edi, %ecx # imm = 0xFCEC4EC5 -; X64-NEXT: addl $8, %ecx -; X64-NEXT: rorl $3, %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1073741721, %rcx, %rax # imm = 0x3FFFFF99 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $60, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $1073741928, %eax, %edx # imm = 0x40000068 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 1073741928 %cmp = icmp eq i32 %srem, 0 @@ -195,22 +280,35 @@ define i32 @test_srem_even_bit31(i32 %X) nounwind { ; X86-LABEL: test_srem_even_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $-989526779, {{[0-9]+}}(%esp), %ecx # imm = 0xC5050505 -; X86-NEXT: addl $2, %ecx -; X86-NEXT: rorl %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2147483545, %edx # imm = 0x7FFFFF99 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $30, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-2147483546, %edx, %edx # imm = 0x80000066 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $-989526779, %edi, %ecx # imm = 0xC5050505 -; X64-NEXT: addl $2, %ecx -; X64-NEXT: rorl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $2147483545, %rcx, %rax # imm = 0x7FFFFF99 +; X64-NEXT: shrq $32, %rax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $30, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-2147483546, %eax, %edx # imm = 0x80000066 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 2147483750 %cmp = icmp eq i32 %srem, 0 @@ -226,20 +324,32 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: setae %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %srem = srem i32 %X, 5 %cmp = icmp ne i32 %srem, 0 @@ -251,20 +361,32 @@ define i32 @test_srem_negative_odd(i32 %X) nounwind { ; X86-LABEL: test_srem_negative_odd: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1717986919, %edx # imm = 0x99999999 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: setae %al +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_negative_odd: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $-1717986919, %rcx, %rax # imm = 0x99999999 +; X64-NEXT: movq %rax, 
%rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: setae %al +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %srem = srem i32 %X, -5 %cmp = icmp ne i32 %srem, 0 @@ -274,22 +396,35 @@ define i32 @test_srem_negative_even(i32 %X) nounwind { ; X86-LABEL: test_srem_negative_even: ; X86: # %bb.0: -; X86-NEXT: imull $-1227133513, {{[0-9]+}}(%esp), %ecx # imm = 0xB6DB6DB7 -; X86-NEXT: addl $306783378, %ecx # imm = 0x12492492 -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1840700269, %edx # imm = 0x6DB6DB6D +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $3, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-14, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $306783379, %ecx # imm = 0x12492493 -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_negative_even: ; X64: # %bb.0: -; X64-NEXT: imull $-1227133513, %edi, %ecx # imm = 0xB6DB6DB7 -; X64-NEXT: addl $306783378, %ecx # imm = 0x12492492 -; X64-NEXT: rorl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1840700269, %rcx, %rax # imm = 0x6DB6DB6D +; X64-NEXT: shrq $32, %rax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $3, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-14, %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $306783379, %ecx # imm = 0x12492493 -; X64-NEXT: setae %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %srem = srem i32 %X, -14 %cmp = icmp ne i32 %srem, 0 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -14,7 +14,7 @@ ; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx ; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx +; SSE-NEXT: shrl $9, %edx ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 ; SSE-NEXT: subl %ecx, %eax @@ -26,7 +26,7 @@ ; SSE-NEXT: movzwl %dx, %edx ; SSE-NEXT: movswl %dx, %esi ; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi +; SSE-NEXT: shrl $6, %esi ; SSE-NEXT: addl %edx, %esi ; SSE-NEXT: imull $95, %esi, %edx ; SSE-NEXT: subl %edx, %ecx @@ -36,7 +36,8 @@ ; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF ; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx +; SSE-NEXT: sarl $16, %edx +; SSE-NEXT: shrl $5, %edx ; SSE-NEXT: addl %esi, %edx ; SSE-NEXT: imull $-124, %edx, %edx ; SSE-NEXT: subl %edx, %ecx @@ -46,7 +47,8 @@ ; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 ; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx +; SSE-NEXT: sarl $16, %edx +; SSE-NEXT: shrl $2, %edx ; SSE-NEXT: addl %esi, %edx ; SSE-NEXT: imull $98, %edx, %edx ; SSE-NEXT: subl %edx, %ecx @@ -65,7 +67,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx +; AVX-NEXT: shrl $9, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 ; AVX-NEXT: subl %ecx, %eax @@ -77,7 +79,7 @@ ; AVX-NEXT: movzwl %dx, %edx ; AVX-NEXT: movswl %dx, %esi ; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi 
+; AVX-NEXT: shrl $6, %esi ; AVX-NEXT: addl %edx, %esi ; AVX-NEXT: imull $95, %esi, %edx ; AVX-NEXT: subl %edx, %ecx @@ -87,7 +89,8 @@ ; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF ; AVX-NEXT: movl %edx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx +; AVX-NEXT: sarl $16, %edx +; AVX-NEXT: shrl $5, %edx ; AVX-NEXT: addl %esi, %edx ; AVX-NEXT: imull $-124, %edx, %edx ; AVX-NEXT: subl %edx, %ecx @@ -97,7 +100,8 @@ ; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 ; AVX-NEXT: movl %edx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx +; AVX-NEXT: sarl $16, %edx +; AVX-NEXT: shrl $2, %edx ; AVX-NEXT: addl %esi, %edx ; AVX-NEXT: imull $98, %edx, %edx ; AVX-NEXT: subl %edx, %ecx @@ -180,13 +184,13 @@ ; SSE-NEXT: leal 31(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32, %ecx +; SSE-NEXT: andl $65504, %ecx # imm = 0xFFE0 ; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: movd %xmm0, %ecx ; SSE-NEXT: leal 63(%rcx), %edx ; SSE-NEXT: testw %cx, %cx ; SSE-NEXT: cmovnsl %ecx, %edx -; SSE-NEXT: andl $-64, %edx +; SSE-NEXT: andl $65472, %edx # imm = 0xFFC0 ; SSE-NEXT: subl %edx, %ecx ; SSE-NEXT: movd %ecx, %xmm0 ; SSE-NEXT: pinsrw $1, %eax, %xmm0 @@ -194,7 +198,7 @@ ; SSE-NEXT: leal 7(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-8, %ecx +; SSE-NEXT: andl $65528, %ecx # imm = 0xFFF8 ; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: pinsrw $2, %eax, %xmm0 ; SSE-NEXT: pextrw $3, %xmm1, %eax @@ -205,7 +209,7 @@ ; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx ; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx +; SSE-NEXT: shrl $6, %edx ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $95, %edx, %ecx ; SSE-NEXT: subl %ecx, %eax @@ -218,13 +222,13 @@ ; AVX-NEXT: leal 31(%rax), %ecx ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32, %ecx +; AVX-NEXT: andl $65504, %ecx # imm = 0xFFE0 ; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vmovd %xmm0, %ecx ; AVX-NEXT: leal 63(%rcx), %edx ; AVX-NEXT: testw %cx, %cx ; AVX-NEXT: cmovnsl %ecx, %edx -; AVX-NEXT: andl $-64, %edx +; AVX-NEXT: andl $65472, %edx # imm = 0xFFC0 ; AVX-NEXT: subl %edx, %ecx ; AVX-NEXT: vmovd %ecx, %xmm1 ; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 @@ -232,7 +236,7 @@ ; AVX-NEXT: leal 7(%rax), %ecx ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-8, %ecx +; AVX-NEXT: andl $65528, %ecx # imm = 0xFFF8 ; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax @@ -243,7 +247,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx +; AVX-NEXT: shrl $6, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: imull $95, %edx, %ecx ; AVX-NEXT: subl %ecx, %eax @@ -265,7 +269,7 @@ ; SSE-NEXT: movzwl %ax, %edx ; SSE-NEXT: movswl %dx, %eax ; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $4, %eax +; SSE-NEXT: shrl $4, %eax ; SSE-NEXT: addl %edx, %eax ; SSE-NEXT: leal (%rax,%rax,2), %edx ; SSE-NEXT: shll $3, %edx @@ -276,7 +280,8 @@ ; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B ; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %edx +; SSE-NEXT: sarl $16, %edx +; SSE-NEXT: shrl $7, %edx ; SSE-NEXT: addl %esi, %edx ; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E ; SSE-NEXT: subl %edx, %ecx @@ -298,32 +303,33 @@ ; ; AVX-LABEL: dont_fold_srem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 
-; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: movswl %cx, %eax +; AVX-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 +; AVX-NEXT: shrl $16, %eax +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: movzwl %ax, %edx +; AVX-NEXT: movswl %dx, %eax +; AVX-NEXT: shrl $15, %edx +; AVX-NEXT: shrl $4, %eax +; AVX-NEXT: addl %edx, %eax +; AVX-NEXT: leal (%rax,%rax,2), %edx +; AVX-NEXT: shll $3, %edx +; AVX-NEXT: subl %edx, %eax +; AVX-NEXT: addl %ecx, %eax +; AVX-NEXT: vpextrw $1, %xmm0, %ecx ; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; AVX-NEXT: movl %ecx, %esi +; AVX-NEXT: imull $12827, %edx, %edx # imm = 0x321B +; AVX-NEXT: movl %edx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx -; AVX-NEXT: addl %esi, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: sarl $16, %edx +; AVX-NEXT: shrl $7, %edx +; AVX-NEXT: addl %esi, %edx +; AVX-NEXT: imull $654, %edx, %edx # imm = 0x28E +; AVX-NEXT: subl %edx, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax ; AVX-NEXT: movswl %ax, %ecx ; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 @@ -351,7 +357,7 @@ ; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx ; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: shrl $4, %edx ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: leal (%rdx,%rdx,2), %ecx ; SSE-NEXT: shll $3, %ecx @@ -361,7 +367,7 @@ ; SSE-NEXT: leal 32767(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 +; SSE-NEXT: andl $32768, %ecx # imm = 0x8000 ; SSE-NEXT: addl %eax, %ecx ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pinsrw $1, %ecx, %xmm1 @@ -389,7 +395,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: shrl $4, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: leal (%rdx,%rdx,2), %ecx ; AVX-NEXT: shll $3, %ecx @@ -399,7 +405,7 @@ ; AVX-NEXT: leal 32767(%rax), %ecx ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 +; AVX-NEXT: andl $32768, %ecx # imm = 0x8000 ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/sse-intel-ocl.ll b/llvm/test/CodeGen/X86/sse-intel-ocl.ll --- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll @@ -220,9 +220,9 @@ ; WIN64-NEXT: subq $232, %rsp ; WIN64-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; WIN64-NEXT: movaps (%r9), %xmm4 -; WIN64-NEXT: movaps (%rdx), %xmm5 -; WIN64-NEXT: movaps (%r8), %xmm6 +; WIN64-NEXT: movaps (%rdx), %xmm4 +; WIN64-NEXT: movaps (%r8), %xmm5 +; WIN64-NEXT: movaps (%r9), %xmm6 ; WIN64-NEXT: movaps (%rcx), %xmm7 ; WIN64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) @@ -231,11 +231,11 @@ ; WIN64-NEXT: 
movaps %xmm7, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -103,6 +103,8 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovaps (%edx), %xmm0 ; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -117,6 +119,8 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 ; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp = load <4 x float>, ptr %B ; <<4 x float>> [#uses=2] @@ -597,25 +601,25 @@ define fastcc void @test17() nounwind { ; X86-SSE-LABEL: test17: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = +; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: test17: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.59177481E-41,4.59177481E-41,4.59177481E-41,4.59177481E-41] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test17: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = +; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = ; X64-SSE-NEXT: movaps %xmm0, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: test17: ; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.59177481E-41,4.59177481E-41,4.59177481E-41,4.59177481E-41] ; X64-AVX-NEXT: vmovaps %xmm0, (%rax) ; X64-AVX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -764,34 +764,52 @@ ; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02] +; X86-SSE-NEXT: movd (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6e,0x08] +; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshufd $68, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x44] +; X86-SSE-NEXT: ## xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pblendw $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x22,0x00,0x02] +; X86-AVX1-NEXT: vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08] +; X86-AVX1-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] +; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08] +; X86-AVX512-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02] +; X64-SSE-NEXT: movd (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x6e,0x0f] +; X64-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshufd $68, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x44] +; X64-SSE-NEXT: ## xmm1 = xmm1[0,1,0,1] +; X64-SSE-NEXT: pblendw $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] +; X64-AVX1-NEXT: vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f] +; X64-AVX1-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] +; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f] +; X64-AVX512-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = load i32, ptr %b, align 4 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 @@ -1160,29 +1178,27 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_W00W: ; SSE: ## %bb.0: -; SSE-NEXT: pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff] -; SSE-NEXT: ## xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] -; SSE-NEXT: pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3] -; SSE-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; SSE-NEXT: movdqa %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x6f,0xc8] +; SSE-NEXT: psrldq $12, %xmm1 ## encoding: [0x66,0x0f,0x73,0xd9,0x0c] +; SSE-NEXT: ## xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f] +; SSE-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_W00W: ; AVX1: ## %bb.0: -; AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: 
[0xc5,0xf8,0xc6,0xc0,0xff] -; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: vpsrldq $12, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x73,0xd8,0x0c] +; AVX1-NEXT: ## xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpblendw $192, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0e,0xc0,0xc0] +; AVX1-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_W00W: ; AVX512: ## %bb.0: -; AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] -; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512-NEXT: vpsrldq $12, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xd8,0x0c] +; AVX512-NEXT: ## xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] +; AVX512-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 3 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1595,8 +1611,9 @@ ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] +; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32: @@ -1615,8 +1632,9 @@ ; ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] +; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32: @@ -2124,14 +2142,14 @@ ; AVX1-LABEL: build_vector_to_shuffle_1: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX512-NEXT: vblendps 
$5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 @@ -2152,14 +2170,14 @@ ; AVX1-LABEL: build_vector_to_shuffle_2: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_2: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -128,7 +128,7 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -33,15 +33,13 @@ ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,0,3,2] ; X64-NEXT: pand %xmm7, %xmm1 ; X64-NEXT: andpd %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-NEXT: pand %xmm2, %xmm0 -; X64-NEXT: pxor %xmm5, %xmm5 -; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: por %xmm2, %xmm5 -; X64-NEXT: pcmpeqd %xmm2, %xmm2 -; X64-NEXT: pxor %xmm5, %xmm2 -; X64-NEXT: por %xmm0, %xmm2 -; X64-NEXT: pandn %xmm2, %xmm1 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: pxor %xmm4, %xmm4 +; X64-NEXT: pcmpgtd %xmm0, %xmm4 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; X64-NEXT: por %xmm2, %xmm4 +; X64-NEXT: pandn %xmm4, %xmm1 ; X64-NEXT: por %xmm3, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1270,18 +1270,19 @@ ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = 
[9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1475,55 +1476,57 @@ ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm8 -; SSE41-NEXT: por %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: psubq %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm9 ; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; 
SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1844,103 +1847,107 @@ ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm10 +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm12 -; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm9 -; SSE41-NEXT: por %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm12 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm13 +; SSE41-NEXT: pand %xmm12, %xmm13 ; SSE41-NEXT: por %xmm0, %xmm13 -; SSE41-NEXT: pxor %xmm10, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = 
xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm13, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm7 +; SSE41-NEXT: pxor %xmm9, %xmm7 ; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm7, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll --- a/llvm/test/CodeGen/X86/store-narrow.ll +++ b/llvm/test/CodeGen/X86/store-narrow.ll @@ -67,22 +67,21 @@ define void @test3(ptr nocapture %a0, i16 zeroext %a1) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry -; X64-NEXT: movw %si, (%rdi) +; X64-NEXT: movzwl 2(%rdi), %eax +; X64-NEXT: 
shll $16, %eax +; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq ; -; X86-BWON-LABEL: test3: -; X86-BWON: ## %bb.0: ## %entry -; X86-BWON-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWON-NEXT: movw %ax, (%ecx) -; X86-BWON-NEXT: retl -; -; X86-BWOFF-LABEL: test3: -; X86-BWOFF: ## %bb.0: ## %entry -; X86-BWOFF-NEXT: movw {{[0-9]+}}(%esp), %ax -; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWOFF-NEXT: movw %ax, (%ecx) -; X86-BWOFF-NEXT: retl +; X86-LABEL: test3: +; X86: ## %bb.0: ## %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl 2(%ecx), %edx +; X86-NEXT: shll $16, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: retl entry: %A = load i32, ptr %a0, align 4 %B = and i32 %A, -65536 ; 0xFFFF0000 @@ -95,22 +94,21 @@ define void @test4(ptr nocapture %a0, i16 zeroext %a1) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry -; X64-NEXT: movw %si, 2(%rdi) +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: shll $16, %esi +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, (%rdi) ; X64-NEXT: retq ; -; X86-BWON-LABEL: test4: -; X86-BWON: ## %bb.0: ## %entry -; X86-BWON-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWON-NEXT: movw %ax, 2(%ecx) -; X86-BWON-NEXT: retl -; -; X86-BWOFF-LABEL: test4: -; X86-BWOFF: ## %bb.0: ## %entry -; X86-BWOFF-NEXT: movw {{[0-9]+}}(%esp), %ax -; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWOFF-NEXT: movw %ax, 2(%ecx) -; X86-BWOFF-NEXT: retl +; X86-LABEL: test4: +; X86: ## %bb.0: ## %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl entry: %A = load i32, ptr %a0, align 4 %B = and i32 %A, 65535 ; 0x0000FFFF diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll --- a/llvm/test/CodeGen/X86/stores-merging.ll +++ b/llvm/test/CodeGen/X86/stores-merging.ll @@ -13,8 +13,9 @@ define dso_local void @redundant_stores_merging() { ; CHECK-LABEL: redundant_stores_merging: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001 +; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001 ; CHECK-NEXT: movq %rax, e+4(%rip) +; CHECK-NEXT: movl $456, e+8(%rip) # imm = 0x1C8 ; CHECK-NEXT: retq store i32 1, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 1), align 4 store i32 123, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 2), align 4 @@ -26,9 +27,8 @@ define dso_local void @redundant_stores_merging_reverse() { ; CHECK-LABEL: redundant_stores_merging_reverse: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001 +; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001 ; CHECK-NEXT: movq %rax, e+4(%rip) -; CHECK-NEXT: movl $456, e+8(%rip) # imm = 0x1C8 ; CHECK-NEXT: retq store i32 123, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 2), align 4 store i32 456, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 2), align 4 @@ -359,14 +359,12 @@ define dso_local void @rotate32_consecutive(ptr %p) { ; CHECK-LABEL: rotate32_consecutive: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movzwl 4(%rdi), %edx -; CHECK-NEXT: movzwl 6(%rdi), %esi -; CHECK-NEXT: movw %cx, 84(%rdi) -; CHECK-NEXT: movw %ax, 
86(%rdi) -; CHECK-NEXT: movw %si, 88(%rdi) -; CHECK-NEXT: movw %dx, 90(%rdi) +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: movl 4(%rdi), %ecx +; CHECK-NEXT: roll $16, %eax +; CHECK-NEXT: roll $16, %ecx +; CHECK-NEXT: movl %eax, 84(%rdi) +; CHECK-NEXT: movl %ecx, 88(%rdi) ; CHECK-NEXT: retq %p1 = getelementptr i16, ptr %p, i64 1 %p2 = getelementptr i16, ptr %p, i64 2 @@ -433,7 +431,12 @@ define dso_local void @trunc_i32_to_i8(i32 %x, ptr %p) { ; CHECK-LABEL: trunc_i32_to_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, (%rsi) +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movw %di, (%rsi) +; CHECK-NEXT: shrl $16, %edi +; CHECK-NEXT: shrl $24, %eax +; CHECK-NEXT: movb %dil, 2(%rsi) +; CHECK-NEXT: movb %al, 3(%rsi) ; CHECK-NEXT: retq %t1 = trunc i32 %x to i8 %sh1 = lshr i32 %x, 8 @@ -499,7 +502,25 @@ define dso_local void @trunc_i64_to_i8(i64 %x, ptr %p) { ; CHECK-LABEL: trunc_i64_to_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, (%rsi) +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movq %rdi, %rcx +; CHECK-NEXT: movq %rdi, %rdx +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: movw %di, (%rsi) +; CHECK-NEXT: # kill: def $edi killed $edi killed $rdi +; CHECK-NEXT: shrl $16, %edi +; CHECK-NEXT: shrl $24, %eax +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: shrq $40, %rdx +; CHECK-NEXT: shrq $48, %r8 +; CHECK-NEXT: shrq $56, %r9 +; CHECK-NEXT: movb %dil, 2(%rsi) +; CHECK-NEXT: movb %al, 3(%rsi) +; CHECK-NEXT: movb %cl, 4(%rsi) +; CHECK-NEXT: movb %dl, 5(%rsi) +; CHECK-NEXT: movb %r8b, 6(%rsi) +; CHECK-NEXT: movb %r9b, 7(%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i8 %sh1 = lshr i64 %x, 8 @@ -537,7 +558,12 @@ define dso_local void @trunc_i64_to_i16(i64 %x, ptr %p) { ; CHECK-LABEL: trunc_i64_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, (%rsi) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movl %edi, (%rsi) +; CHECK-NEXT: shrq $32, %rdi +; CHECK-NEXT: shrq $48, %rax +; CHECK-NEXT: movw %di, 4(%rsi) +; CHECK-NEXT: movw %ax, 6(%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i16 %sh1 = lshr i64 %x, 16 diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -43,16 +43,21 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: subq (%rsi), %rdx -; CHECK-NEXT: movl $0, %edi -; CHECK-NEXT: sbbq 8(%rsi), %rdi -; CHECK-NEXT: movl $0, %r8d -; CHECK-NEXT: sbbq 16(%rsi), %r8 -; CHECK-NEXT: sbbq 24(%rsi), %rcx -; CHECK-NEXT: movq %rdx, (%rax) +; CHECK-NEXT: subq (%rsi), %rcx +; CHECK-NEXT: setae %dl +; CHECK-NEXT: movq 8(%rsi), %rdi +; CHECK-NEXT: movq 16(%rsi), %r8 +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: notq %r8 +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: movq 24(%rsi), %rdx +; CHECK-NEXT: notq %rdx +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq entry: %0 = load i64, ptr %this, align 8 @@ -94,13 +99,12 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rdi -; CHECK-NEXT: movq 8(%rsi), %r10 +; CHECK-NEXT: xorl %r10d, %r10d ; CHECK-NEXT: subq %rdx, %rdi -; CHECK-NEXT: setae %dl -; CHECK-NEXT: addb $-1, %dl -; CHECK-NEXT: adcq $0, %r10 +; CHECK-NEXT: setae %r10b +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: addq 8(%rsi), %r10 ; CHECK-NEXT: setb %dl -; CHECK-NEXT: movzbl %dl, %edx ; 
CHECK-NEXT: notq %rcx ; CHECK-NEXT: addq %r10, %rcx ; CHECK-NEXT: adcq 16(%rsi), %rdx @@ -596,14 +600,12 @@ ; CHECK-NEXT: movq 8(%rsi), %rdi ; CHECK-NEXT: movq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rsi), %rsi -; CHECK-NEXT: xorl %r9d, %r9d ; CHECK-NEXT: subq 16(%rdx), %r8 -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: subq 24(%rdx), %rsi +; CHECK-NEXT: sbbq 24(%rdx), %rsi ; CHECK-NEXT: subq (%rdx), %rcx ; CHECK-NEXT: sbbq 8(%rdx), %rdi ; CHECK-NEXT: sbbq $0, %r8 -; CHECK-NEXT: sbbq %r9, %rsi +; CHECK-NEXT: sbbq $0, %rsi ; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -1646,18 +1646,31 @@ } define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default) { -; X86-LABEL: broadcast_v4f64_v2f64_4u61: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1 -; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; X86-NEXT: retl +; X86-AVX-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X86-AVX-NEXT: retl ; -; X64-LABEL: broadcast_v4f64_v2f64_4u61: -; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; X64-NEXT: retq +; X86-AVX512-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X64-AVX512-NEXT: retq %vec = load <2 x double>, ptr %vp %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %default @@ -1669,13 +1682,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; X86-NEXT: retl ; ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu: ; X64: # %bb.0: ; X64-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; X64-NEXT: retq %vec = load <2 x float>, ptr %vp %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> @@ -1723,7 +1736,8 @@ ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vinserti64x4 $1, 
%ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; ; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101: @@ -1734,7 +1748,8 @@ ; ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %vec = load <2 x double>, ptr %vp %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -356,44 +356,14 @@ ; SCALAR-NEXT: movl %eax, 8(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -419,44 +389,14 @@ ; SCALAR-NEXT: movl %eax, 8(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) 
-; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -566,44 +506,14 @@ ; SCALAR-NEXT: movw %r8w, 8(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -661,44 +571,14 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = 
mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -966,49 +846,16 @@ ; SCALAR-NEXT: movl %eax, 24(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -1042,49 +889,16 @@ ; SCALAR-NEXT: movl %eax, 24(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; 
AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -1318,49 +1132,16 @@ ; SCALAR-NEXT: movw %ax, 24(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -1513,49 +1294,16 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -2130,53 +1878,18 @@ ; SCALAR-NEXT: movl %eax, 40(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -2218,53 +1931,18 @@ ; SCALAR-NEXT: movl %eax, 40(%rdx) ; SCALAR-NEXT: retq ; 
-; SSE2-LABEL: vec384_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -3331,21 +3009,21 @@ define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v3i64: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: notq %rdi +; SCALAR-NEXT: movq 16(%rdi), %rax +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi ; SCALAR-NEXT: notq %rcx +; SCALAR-NEXT: notq %rdi ; SCALAR-NEXT: notq %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v3i64: @@ -3403,21 +3081,21 @@ define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v3f64: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; 
SCALAR-NEXT: notq %rdi +; SCALAR-NEXT: movq 16(%rdi), %rax +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi ; SCALAR-NEXT: notq %rcx +; SCALAR-NEXT: notq %rdi ; SCALAR-NEXT: notq %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v3f64: @@ -3641,53 +3319,18 @@ ; SCALAR-NEXT: movw %ax, 40(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -3794,56 +3437,56 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6i8: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %rax -; SCALAR-NEXT: shrq $40, %rax -; SCALAR-NEXT: movq %rdi, %rcx -; SCALAR-NEXT: shrq $32, %rcx -; SCALAR-NEXT: movl %edi, %r8d -; SCALAR-NEXT: shrl $24, %r8d -; SCALAR-NEXT: movl %edi, %r9d -; SCALAR-NEXT: shrl $16, %r9d -; SCALAR-NEXT: movl %edi, 
%r10d +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movl %ecx, %eax +; SCALAR-NEXT: shrl $16, %eax +; SCALAR-NEXT: movl %ecx, %edi +; SCALAR-NEXT: shrl $24, %edi +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $40, %r8 +; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: shrq $32, %r9 +; SCALAR-NEXT: movl %ecx, %r10d ; SCALAR-NEXT: shrl $8, %r10d -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d ; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %edi, %r10d +; SCALAR-NEXT: orl %ecx, %r10d ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %edi +; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: shll $8, %r8d -; SCALAR-NEXT: orl %edi, %r8d -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movzbl %cl, %ecx +; SCALAR-NEXT: movzbl %r8b, %ecx +; SCALAR-NEXT: shll $8, %ecx +; SCALAR-NEXT: orl %r9d, %ecx +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movzbl %al, %eax -; SCALAR-NEXT: shll $8, %eax -; SCALAR-NEXT: orl %ecx, %eax -; SCALAR-NEXT: movw %ax, 4(%rsi) -; SCALAR-NEXT: shll $16, %r8d -; SCALAR-NEXT: movzwl %r10w, %ecx -; SCALAR-NEXT: orl %r8d, %ecx -; SCALAR-NEXT: movl %ecx, (%rsi) -; SCALAR-NEXT: movw %ax, 4(%rdx) -; SCALAR-NEXT: movl %ecx, (%rdx) -; SCALAR-NEXT: movw %ax, 12(%rdx) -; SCALAR-NEXT: movl %ecx, 8(%rdx) -; SCALAR-NEXT: movw %ax, 20(%rdx) -; SCALAR-NEXT: movl %ecx, 16(%rdx) -; SCALAR-NEXT: movw %ax, 28(%rdx) -; SCALAR-NEXT: movl %ecx, 24(%rdx) -; SCALAR-NEXT: movw %ax, 36(%rdx) -; SCALAR-NEXT: movl %ecx, 32(%rdx) -; SCALAR-NEXT: movw %ax, 44(%rdx) -; SCALAR-NEXT: movl %ecx, 40(%rdx) -; SCALAR-NEXT: movw %ax, 52(%rdx) -; SCALAR-NEXT: movl %ecx, 48(%rdx) -; SCALAR-NEXT: movw %ax, 60(%rdx) -; SCALAR-NEXT: movl %ecx, 56(%rdx) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: shll $16, %eax +; SCALAR-NEXT: shll $24, %edi +; SCALAR-NEXT: orl %eax, %edi +; SCALAR-NEXT: movzwl %r10w, %eax +; SCALAR-NEXT: orl %edi, %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movw %cx, 4(%rdx) +; SCALAR-NEXT: movl %eax, (%rdx) +; SCALAR-NEXT: movw %cx, 12(%rdx) +; SCALAR-NEXT: movl %eax, 8(%rdx) +; SCALAR-NEXT: movw %cx, 20(%rdx) +; SCALAR-NEXT: movl %eax, 16(%rdx) +; SCALAR-NEXT: movw %cx, 28(%rdx) +; SCALAR-NEXT: movl %eax, 24(%rdx) +; SCALAR-NEXT: movw %cx, 36(%rdx) +; SCALAR-NEXT: movl %eax, 32(%rdx) +; SCALAR-NEXT: movw %cx, 44(%rdx) +; SCALAR-NEXT: movl %eax, 40(%rdx) +; SCALAR-NEXT: movw %cx, 52(%rdx) +; SCALAR-NEXT: movl %eax, 48(%rdx) +; SCALAR-NEXT: movw %cx, 60(%rdx) +; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-ONLY-LABEL: vec384_v6i8: @@ -4214,36 +3857,36 @@ define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6i32: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rax ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r9 ; SCALAR-NEXT: shrq $32, %r9 -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %rcx, %r10 ; SCALAR-NEXT: shrq $32, %r10 ; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: shlq $32, %r10 -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: orq %r10, %rdi +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: orq %r10, %rcx ; SCALAR-NEXT: notl 
%r9d ; SCALAR-NEXT: shlq $32, %r9 -; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: orq %r9, %rcx +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: orq %r9, %rdi ; SCALAR-NEXT: notl %r8d ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: notl %eax ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v6i32: @@ -4301,36 +3944,36 @@ define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6f32: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rax ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r9 ; SCALAR-NEXT: shrq $32, %r9 -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %rcx, %r10 ; SCALAR-NEXT: shrq $32, %r10 ; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: shlq $32, %r10 -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: orq %r10, %rdi +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: orq %r10, %rcx ; SCALAR-NEXT: notl %r9d ; SCALAR-NEXT: shlq $32, %r9 -; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: orq %r9, %rcx +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: orq %r9, %rdi ; SCALAR-NEXT: notl %r8d ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: notl %eax ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v6f32: @@ -4465,53 +4108,18 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; 
AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -4648,11 +4256,11 @@ ; SCALAR-NEXT: shrl $16, %r12d ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movzbl %r12b, %r12d +; SCALAR-NEXT: shll $16, %r12d ; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: movzbl %r15b, %r15d -; SCALAR-NEXT: shll $8, %r15d +; SCALAR-NEXT: shll $24, %r15d ; SCALAR-NEXT: orl %r12d, %r15d -; SCALAR-NEXT: shll $16, %r15d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: notb %bpl @@ -4663,11 +4271,11 @@ ; SCALAR-NEXT: orl %r15d, %r9d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movzbl %r14b, %ebp +; SCALAR-NEXT: shll $16, %ebp ; SCALAR-NEXT: notb %bl ; SCALAR-NEXT: movzbl %bl, %ebx -; SCALAR-NEXT: shll $8, %ebx +; SCALAR-NEXT: shll $24, %ebx ; SCALAR-NEXT: orl %ebp, %ebx -; SCALAR-NEXT: shll $16, %ebx ; SCALAR-NEXT: notb %r11b ; SCALAR-NEXT: movzbl %r11b, %r11d ; SCALAR-NEXT: notb %r10b @@ -4678,11 +4286,11 @@ ; SCALAR-NEXT: orl %ebx, %r10d ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $16, %r8d ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl %dil, %edi -; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: shll $24, %edi ; SCALAR-NEXT: orl %r8d, %edi -; SCALAR-NEXT: shll $16, %edi ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %al @@ -4827,20 +4435,20 @@ ; SCALAR: # %bb.0: ; SCALAR-NEXT: pushq %r14 ; SCALAR-NEXT: pushq %rbx -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rax ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: movq %rax, %r9 ; SCALAR-NEXT: shrq $48, %r9 -; SCALAR-NEXT: movq %rcx, %r10 +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r10 ; SCALAR-NEXT: shrq $32, %r10 -; SCALAR-NEXT: movq %rcx, %r11 +; SCALAR-NEXT: movq %rdi, %r11 ; SCALAR-NEXT: shrq $48, %r11 -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %rbx +; SCALAR-NEXT: movq %rcx, %rbx ; SCALAR-NEXT: shrq $32, %rbx -; SCALAR-NEXT: movq %rdi, %r14 +; SCALAR-NEXT: movq %rcx, %r14 ; SCALAR-NEXT: shrq $48, %r14 ; SCALAR-NEXT: notl %r14d ; SCALAR-NEXT: shll $16, %r14d @@ -4848,16 +4456,16 @@ ; SCALAR-NEXT: movzwl %bx, %ebx ; SCALAR-NEXT: orl %r14d, %ebx ; SCALAR-NEXT: shlq $32, %rbx -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: 
orq %rbx, %rdi +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: orq %rbx, %rcx ; SCALAR-NEXT: notl %r11d ; SCALAR-NEXT: shll $16, %r11d ; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: movzwl %r10w, %r10d ; SCALAR-NEXT: orl %r11d, %r10d ; SCALAR-NEXT: shlq $32, %r10 -; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: orq %r10, %rcx +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: orq %r10, %rdi ; SCALAR-NEXT: notl %r9d ; SCALAR-NEXT: shll $16, %r9d ; SCALAR-NEXT: notl %r8d @@ -4866,15 +4474,15 @@ ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: notl %eax ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r14 ; SCALAR-NEXT: retq @@ -5122,101 +4730,101 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v24i8: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq 16(%rdi), %rax +; SCALAR-NEXT: movq (%rdi), %r8 ; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r8 -; SCALAR-NEXT: shrq $40, %r8 -; SCALAR-NEXT: movq %rdi, %r9 +; SCALAR-NEXT: movq %r8, %rdi +; SCALAR-NEXT: shrq $40, %rdi +; SCALAR-NEXT: movq %r8, %r9 ; SCALAR-NEXT: shrq $56, %r9 -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %r8, %r10 ; SCALAR-NEXT: shrq $48, %r10 ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $16, %r10d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $24, %r11d +; SCALAR-NEXT: orl %r10d, %r11d +; SCALAR-NEXT: movq %r8, %r9 +; SCALAR-NEXT: shrq $32, %r9 ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r10d, %r9d -; SCALAR-NEXT: movq %rdi, %r10 -; SCALAR-NEXT: shrq $32, %r10 +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r9d, %edi +; SCALAR-NEXT: movl %r8d, %r10d +; SCALAR-NEXT: shrl $24, %r10d +; SCALAR-NEXT: movzwl %di, %r9d +; SCALAR-NEXT: orl %r11d, %r9d +; SCALAR-NEXT: movl %r8d, %edi +; SCALAR-NEXT: shrl $16, %edi +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $16, %edi ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $24, %r10d +; SCALAR-NEXT: orl %edi, %r10d +; SCALAR-NEXT: movl %r8d, %edi +; SCALAR-NEXT: shrl $8, %edi ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: shll $8, %r8d -; SCALAR-NEXT: orl %r10d, %r8d -; SCALAR-NEXT: movl %edi, %r10d -; SCALAR-NEXT: shrl $24, %r10d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r8w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d -; SCALAR-NEXT: movl %edi, %r9d -; SCALAR-NEXT: shrl $16, %r9d +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r8d, %edi +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $40, %r8 +; SCALAR-NEXT: movzwl %di, %edi +; SCALAR-NEXT: orl %r10d, %edi 
+; SCALAR-NEXT: movq %rcx, %r10 +; SCALAR-NEXT: shrq $56, %r10 +; SCALAR-NEXT: shlq $32, %r9 +; SCALAR-NEXT: orq %r9, %rdi +; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: shrq $48, %r9 ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $16, %r9d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: shll $24, %r10d ; SCALAR-NEXT: orl %r9d, %r10d -; SCALAR-NEXT: movl %edi, %r9d -; SCALAR-NEXT: shrl $8, %r9d -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movzbl %dil, %edi -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r11d -; SCALAR-NEXT: shll $8, %r11d -; SCALAR-NEXT: orl %edi, %r11d ; SCALAR-NEXT: movq %rcx, %r9 -; SCALAR-NEXT: shrq $40, %r9 -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r11w, %edi -; SCALAR-NEXT: orl %r10d, %edi -; SCALAR-NEXT: movq %rcx, %r10 -; SCALAR-NEXT: shrq $56, %r10 -; SCALAR-NEXT: shlq $32, %r8 -; SCALAR-NEXT: orq %r8, %rdi -; SCALAR-NEXT: movq %rcx, %r8 -; SCALAR-NEXT: shrq $48, %r8 +; SCALAR-NEXT: shrq $32, %r9 +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $8, %r8d +; SCALAR-NEXT: orl %r9d, %r8d +; SCALAR-NEXT: movl %ecx, %r9d +; SCALAR-NEXT: shrl $24, %r9d +; SCALAR-NEXT: movzwl %r8w, %r8d +; SCALAR-NEXT: orl %r10d, %r8d +; SCALAR-NEXT: movl %ecx, %r10d +; SCALAR-NEXT: shrl $16, %r10d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d -; SCALAR-NEXT: movq %rcx, %r8 -; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d -; SCALAR-NEXT: movl %ecx, %r11d -; SCALAR-NEXT: shrl $24, %r11d ; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %r8d -; SCALAR-NEXT: orl %r10d, %r8d -; SCALAR-NEXT: movl %ecx, %r9d -; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r9d, %r10d +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $24, %r11d +; SCALAR-NEXT: orl %r10d, %r11d ; SCALAR-NEXT: movl %ecx, %r9d ; SCALAR-NEXT: shrl $8, %r9d ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r11d -; SCALAR-NEXT: shll $8, %r11d -; SCALAR-NEXT: orl %ecx, %r11d +; SCALAR-NEXT: movzbl %r9b, %r10d +; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %ecx, %r10d ; SCALAR-NEXT: movq %rax, %r9 ; SCALAR-NEXT: shrq $40, %r9 -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r11w, %ecx -; SCALAR-NEXT: orl %r10d, %ecx +; SCALAR-NEXT: movzwl %r10w, %ecx +; SCALAR-NEXT: orl %r11d, %ecx ; SCALAR-NEXT: movq %rax, %r10 ; SCALAR-NEXT: shrq $56, %r10 ; SCALAR-NEXT: shlq $32, %r8 @@ -5225,9 +4833,10 @@ ; SCALAR-NEXT: shrq $48, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $16, %r8d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: shll $24, %r10d ; SCALAR-NEXT: orl %r8d, %r10d ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 @@ -5239,16 +4848,16 @@ ; SCALAR-NEXT: orl %r8d, %r9d ; SCALAR-NEXT: movl %eax, %r11d ; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r10d ; SCALAR-NEXT: movzwl %r9w, %r8d ; SCALAR-NEXT: orl %r10d, %r8d ; SCALAR-NEXT: movl %eax, %r9d ; 
SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $16, %r9d ; SCALAR-NEXT: notb %r11b ; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: shll $24, %r10d ; SCALAR-NEXT: orl %r9d, %r10d ; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $8, %r9d @@ -5258,20 +4867,19 @@ ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: shll $8, %r9d ; SCALAR-NEXT: orl %eax, %r9d -; SCALAR-NEXT: shll $16, %r10d ; SCALAR-NEXT: movzwl %r9w, %eax ; SCALAR-NEXT: orl %r10d, %eax ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) ; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) +; SCALAR-NEXT: movq %rdi, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) ; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) +; SCALAR-NEXT: movq %rdi, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) ; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rdi, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v24i8: @@ -5476,43 +5084,24 @@ ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, 48(%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-ONLY-LABEL: vec512_v2i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512F-LABEL: vec512_v2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: vec512_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX2-LABEL: vec512_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i8> %in.subvec.not, store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -5735,53 +5324,20 @@ ; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq 
%xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -5831,53 +5387,20 @@ ; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: 
vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -6013,26 +5536,26 @@ define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; ALL-LABEL: vec512_v2i128: ; ALL: # %bb.0: -; ALL-NEXT: movq 16(%rdi), %rax -; ALL-NEXT: movq 24(%rdi), %rcx +; ALL-NEXT: movq 24(%rdi), %rax +; ALL-NEXT: movq 16(%rdi), %rcx ; ALL-NEXT: movq (%rdi), %r8 ; ALL-NEXT: movq 8(%rdi), %rdi -; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %r8 +; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rcx, 24(%rsi) -; ALL-NEXT: movq %r8, (%rsi) +; ALL-NEXT: movq %rax, 24(%rsi) +; ALL-NEXT: movq %rcx, 16(%rsi) ; ALL-NEXT: movq %rdi, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rdx) -; ALL-NEXT: movq %rcx, 24(%rdx) -; ALL-NEXT: movq %r8, (%rdx) +; ALL-NEXT: movq %r8, (%rsi) +; ALL-NEXT: movq %rax, 24(%rdx) +; ALL-NEXT: movq %rcx, 16(%rdx) ; ALL-NEXT: movq %rdi, 8(%rdx) -; ALL-NEXT: movq %rax, 48(%rdx) -; ALL-NEXT: movq %rcx, 56(%rdx) -; ALL-NEXT: movq %r8, 32(%rdx) +; ALL-NEXT: movq %r8, (%rdx) +; ALL-NEXT: movq %rax, 56(%rdx) +; ALL-NEXT: movq %rcx, 48(%rdx) ; ALL-NEXT: movq %rdi, 40(%rdx) +; ALL-NEXT: movq %r8, 32(%rdx) ; ALL-NEXT: retq %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i128> %in.subvec.not, @@ -6256,53 +5779,20 @@ ; SCALAR-NEXT: movw %ax, 56(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; 
AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6448,15 +5938,15 @@ ; SSE2-LABEL: vec512_v4i64: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v4i64: @@ -6517,15 +6007,15 @@ ; SSE2-LABEL: vec512_v4f64: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v4f64: @@ -6654,53 +6144,20 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v8i8: -; 
AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6871,15 +6328,15 @@ ; SSE2-LABEL: vec512_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v8i32: @@ -6962,15 +6419,15 @@ ; SSE2-LABEL: vec512_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v8f32: @@ -7361,15 +6818,15 @@ ; SSE2-LABEL: vec512_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v16i16: @@ -7676,15 +7133,15 @@ ; SSE2-LABEL: vec512_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; 
SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v32i8: @@ -7717,4 +7174,6 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512BW: {{.*}} +; AVX512F: {{.*}} ; SSSE3: {{.*}} diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll --- a/llvm/test/CodeGen/X86/switch.ll +++ b/llvm/test/CodeGen/X86/switch.ll @@ -1416,10 +1416,10 @@ define void @int_max_table_cluster(i8 %x) { ; CHECK-LABEL: int_max_table_cluster: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb $-9, %dil +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: cmpl $247, %eax ; CHECK-NEXT: ja .LBB15_4 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: jmpq *.LJTI15_0(,%rax,8) ; CHECK-NEXT: .LBB15_2: # %bb0 ; CHECK-NEXT: xorl %edi, %edi @@ -2443,12 +2443,10 @@ define void @pr23738(i4 %x) { ; CHECK-LABEL: pr23738: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andb $15, %al -; CHECK-NEXT: cmpb $11, %al +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: cmpl $11, %edi ; CHECK-NEXT: ja .LBB23_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movl $2051, %eax # imm = 0x803 ; CHECK-NEXT: btl %edi, %eax ; CHECK-NEXT: jae .LBB23_2 diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -68,8 +68,8 @@ ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] -; CHECK-X64-NEXT: pextrw $4, %xmm0, %eax +; CHECK-X64-NEXT: pslld $8, %xmm0 +; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 ; CHECK-X64-NEXT: # %bb.2: # %no diff --git a/llvm/test/CodeGen/X86/test-shrink.ll b/llvm/test/CodeGen/X86/test-shrink.ll --- a/llvm/test/CodeGen/X86/test-shrink.ll +++ b/llvm/test/CodeGen/X86/test-shrink.ll @@ -822,8 +822,8 @@ ; ; CHECK-X86-LABEL: and64_trunc_16_sign: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: movl $32768, %eax # imm = 0x8000 -; CHECK-X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-X86-NEXT: testw %ax, %ax ; CHECK-X86-NEXT: js .LBB18_2 ; CHECK-X86-NEXT: # %bb.1: # %yes @@ -867,7 +867,8 @@ ; ; CHECK-X86-LABEL: and64_trunc_16_sign_minsize: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: testw $-32768, {{[0-9]+}}(%esp) # imm = 0x8000 +; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: testw $-32768, %ax # imm = 0x8000 ; CHECK-X86-NEXT: js .LBB19_2 ; CHECK-X86-NEXT: # %bb.1: # %yes ; CHECK-X86-NEXT: calll bar@PLT diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll --- a/llvm/test/CodeGen/X86/test-vs-bittest.ll +++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll @@ -601,9 +601,10 @@ define i64 @is_lower_bit_clear_i64(i64 %x) { ; CHECK-LABEL: is_lower_bit_clear_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testl $134217728, %edi # imm = 0x8000000 -; CHECK-NEXT: sete %al +; CHECK-NEXT: movq %rdi, %rax +; 
CHECK-NEXT: shrl $27, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: xorq $1, %rax ; CHECK-NEXT: retq %sh = lshr i64 %x, 27 %m = and i64 %sh, 1 diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -89,8 +89,8 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: shll $4, %ecx diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -27,8 +27,7 @@ ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $8, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %ecx @@ -108,8 +107,8 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: shll $4, %ecx @@ -172,11 +171,9 @@ ; X64: # %bb.0: ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: leaq (%rdi,%rdi), %rax ; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: shrq $63, %rsi -; X64-NEXT: shldq $31, %rax, %rsi ; X64-NEXT: shlq $32, %rdi +; X64-NEXT: shrq $32, %rsi ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: callq __udivti3@PLT ; X64-NEXT: cmpq $2, %rdx @@ -213,15 +210,17 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: calll __udivti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jne .LBB4_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl (%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: .LBB4_2: +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %ebp @@ -283,15 +282,14 @@ ; X86-LABEL: func7: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shll $17, %edx -; X86-NEXT: shrl $15, %ecx -; X86-NEXT: andl $1, %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $15, %edx +; X86-NEXT: shll $17, %eax ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF @@ -318,7 +316,7 @@ ; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm8 +; X64-NEXT: movq %rax, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: movdqa %xmm1, %xmm3 @@ -327,31 +325,30 @@ ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; X64-NEXT: movdqa {{.*#+}} xmm4 = 
[9223372039002259456,9223372039002259456] -; X64-NEXT: movdqa %xmm8, %xmm3 +; X64-NEXT: movdqa %xmm7, %xmm3 ; X64-NEXT: pxor %xmm4, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; X64-NEXT: movdqa {{.*#+}} xmm7 = [2147483649,2147483649,2147483649,2147483649] -; X64-NEXT: pcmpeqd %xmm7, %xmm6 ; X64-NEXT: movdqa {{.*#+}} xmm5 = [9223372043297226751,9223372043297226751] -; X64-NEXT: movdqa %xmm5, %xmm9 -; X64-NEXT: pcmpgtd %xmm3, %xmm9 -; X64-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; X64-NEXT: pand %xmm6, %xmm10 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; X64-NEXT: por %xmm10, %xmm3 +; X64-NEXT: movdqa %xmm5, %xmm6 +; X64-NEXT: pcmpgtd %xmm3, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm5, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; X64-NEXT: pand %xmm8, %xmm9 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; X64-NEXT: por %xmm9, %xmm3 ; X64-NEXT: pcmpeqd %xmm6, %xmm6 -; X64-NEXT: pand %xmm3, %xmm8 +; X64-NEXT: pand %xmm3, %xmm7 ; X64-NEXT: pxor %xmm6, %xmm3 -; X64-NEXT: por %xmm8, %xmm3 +; X64-NEXT: por %xmm7, %xmm3 ; X64-NEXT: psrlq $1, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm8 +; X64-NEXT: movq %rax, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: psrlq $32, %xmm1 @@ -359,17 +356,18 @@ ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; X64-NEXT: pxor %xmm8, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-NEXT: pcmpeqd %xmm7, %xmm0 -; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; X64-NEXT: pxor %xmm7, %xmm4 +; X64-NEXT: movdqa %xmm5, %xmm0 +; X64-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm5, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: pxor %xmm0, %xmm6 -; X64-NEXT: pand %xmm8, %xmm0 +; X64-NEXT: pand %xmm7, %xmm0 ; X64-NEXT: por %xmm6, %xmm0 ; X64-NEXT: psrlq $1, %xmm0 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] @@ -382,66 +380,51 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax), %ecx -; X86-NEXT: shrl $31, %eax -; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: leal (%ebx,%ebx), %eax -; X86-NEXT: shrl $31, %ebx -; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: 
movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: leal (%esi,%esi), %eax -; X86-NEXT: shrl $31, %esi -; X86-NEXT: shldl $31, %eax, %esi ; X86-NEXT: pushl $0 +; X86-NEXT: pushl %ebp ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %esi ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: leal (%edx,%edx), %ecx -; X86-NEXT: shrl $31, %edx -; X86-NEXT: shldl $31, %ecx, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: cmpl $2, %esi -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmpl $2, %edx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: movl $1, %ebp ; X86-NEXT: cmovael %ebp, %esi ; X86-NEXT: shldl $31, %eax, %esi ; X86-NEXT: cmpl $2, %ebx ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: cmovael %ebp, %ebx ; X86-NEXT: shldl $31, %eax, %ebx ; X86-NEXT: cmpl $2, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovael %edx, %eax +; X86-NEXT: cmovael %ecx, %eax ; X86-NEXT: cmovael %ebp, %edi ; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -285,34 +285,38 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: cmpl $1, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: cmpl $1, %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: testl %edx, %edx -; X86-NEXT: movl $1, %edi -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: cmovel %ebx, %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %esi, %ebp ; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %edx, %ebx -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: cmovnel %edi, %ebp +; X86-NEXT: cmovel %esi, %ebp +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: movl $1, %ebx +; X86-NEXT: cmovbl %edx, %esi +; X86-NEXT: cmovbl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel %edx, %esi +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %ecx, %edx +; X86-NEXT: cmovel %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll 
b/llvm/test/CodeGen/X86/umul-with-overflow.ll --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -520,82 +520,81 @@ ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbx, %r15 ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %r12 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r15, %rbx ; X64-NEXT: adcq %r14, %rbp ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r10d -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rbp, %r13 -; X64-NEXT: adcq %r10, %r12 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: adcq %r11, %r12 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r15, %r10 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r15, %r11 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r10, %r15 +; X64-NEXT: addq %r11, %r15 ; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: imulq %r9, %r11 +; X64-NEXT: imulq %r9, %r8 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: addq %r13, %r14 ; X64-NEXT: adcq %r12, %r15 -; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: adcq %rdx, %r8 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp +; X64-NEXT: addq %r11, %rbp ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r11 ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: imulq %r10, %rcx -; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: imulq %r11, %r10 ; X64-NEXT: addq %r14, %r12 ; X64-NEXT: adcq %r15, %rax -; X64-NEXT: adcq %r11, %rcx -; X64-NEXT: imulq %r9, %r8 +; X64-NEXT: adcq %rdx, %r10 +; X64-NEXT: imulq %r9, %rcx +; X64-NEXT: addq %r8, %rcx +; X64-NEXT: addq %r10, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: addq %r8, %rsi ; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: movq %rbx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll 
@@ -76,11 +76,11 @@
 ; X64-LABEL: func3:
 ; X64: # %bb.0:
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $15, %al
-; X64-NEXT: andb $15, %sil
-; X64-NEXT: # kill: def $al killed $al killed $eax
-; X64-NEXT: mulb %sil
+; X64-NEXT: andl $15, %esi
+; X64-NEXT: andl $15, %eax
+; X64-NEXT: imull %esi, %eax
 ; X64-NEXT: shrb $2, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: func3:
@@ -89,8 +89,11 @@
 ; X86-NEXT: andb $15, %al
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: andb $15, %cl
-; X86-NEXT: mulb %cl
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: imull %ecx, %eax
 ; X86-NEXT: shrb $2, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
 ; X86-NEXT: retl
 %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 2)
 ret i4 %tmp
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -39,7 +39,7 @@
 ; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi
 ; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
 ; CHECK-NOBMI-NEXT: andl $4080, %edi # imm = 0xFF0
-; CHECK-NOBMI-NEXT: andl $-4081, %esi # imm = 0xF00F
+; CHECK-NOBMI-NEXT: andl $61455, %esi # imm = 0xF00F
 ; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax
 ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT: retq
@@ -49,7 +49,7 @@
 ; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi
 ; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi
 ; CHECK-BMI-NEXT: andl $4080, %edi # imm = 0xFF0
-; CHECK-BMI-NEXT: andl $-4081, %esi # imm = 0xF00F
+; CHECK-BMI-NEXT: andl $61455, %esi # imm = 0xF00F
 ; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax
 ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -39,7 +39,7 @@
 ; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi
 ; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
 ; CHECK-NOBMI-NEXT: andl $21845, %edi # imm = 0x5555
-; CHECK-NOBMI-NEXT: andl $-21846, %esi # imm = 0xAAAA
+; CHECK-NOBMI-NEXT: andl $43690, %esi # imm = 0xAAAA
 ; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax
 ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT: retq
@@ -49,7 +49,7 @@
 ; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi
 ; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi
 ; CHECK-BMI-NEXT: andl $21845, %edi # imm = 0x5555
-; CHECK-BMI-NEXT: andl $-21846, %esi # imm = 0xAAAA
+; CHECK-BMI-NEXT: andl $43690, %esi # imm = 0xAAAA
 ; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax
 ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
@@ -39,7 +39,7 @@
 ; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi
 ; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
 ; CHECK-NOBMI-NEXT: andl $3855, %edi # imm = 0xF0F
-; CHECK-NOBMI-NEXT: andl $-3856, %esi # imm = 0xF0F0
+; CHECK-NOBMI-NEXT: andl $61680, %esi # imm = 0xF0F0
 ; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax
 ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT: retq
@@ -49,7 +49,7 @@
 ; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi
 ; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi
 ; CHECK-BMI-NEXT: andl $3855, %edi # imm = 0xF0F
-; CHECK-BMI-NEXT: andl $-3856, %esi # imm = 0xF0F0
+; CHECK-BMI-NEXT: andl $61680, %esi # imm = 0xF0F0
 ; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax
 ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -37,7 +37,7 @@
 ; CHECK-NOBMI-LABEL: out16_constmask:
 ; CHECK-NOBMI: # %bb.0:
 ; CHECK-NOBMI-NEXT: movzbl %dil, %eax
-; CHECK-NOBMI-NEXT: andl $-256, %esi
+; CHECK-NOBMI-NEXT: andl $65280, %esi # imm = 0xFF00
 ; CHECK-NOBMI-NEXT: orl %esi, %eax
 ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT: retq
@@ -45,7 +45,7 @@
 ; CHECK-BMI-LABEL: out16_constmask:
 ; CHECK-BMI: # %bb.0:
 ; CHECK-BMI-NEXT: movzbl %dil, %eax
-; CHECK-BMI-NEXT: andl $-256, %esi
+; CHECK-BMI-NEXT: andl $65280, %esi # imm = 0xFF00
 ; CHECK-BMI-NEXT: orl %esi, %eax
 ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -33,11 +33,10 @@
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: out16:
 ; CHECK-NOBMI: # %bb.0:
-; CHECK-NOBMI-NEXT: movl %edx, %eax
-; CHECK-NOBMI-NEXT: andl %edx, %edi
-; CHECK-NOBMI-NEXT: notl %eax
-; CHECK-NOBMI-NEXT: andl %esi, %eax
-; CHECK-NOBMI-NEXT: orl %edi, %eax
+; CHECK-NOBMI-NEXT: movl %edi, %eax
+; CHECK-NOBMI-NEXT: xorl %esi, %eax
+; CHECK-NOBMI-NEXT: andl %edx, %eax
+; CHECK-NOBMI-NEXT: xorl %esi, %eax
 ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
@@ -11,12 +11,26 @@
 ; CHECK-SSE1-LABEL: out_constant_varx_mone:
 ; CHECK-SSE1: # %bb.0:
 ; CHECK-SSE1-NEXT: movq %rdi, %rax
-; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
-; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andps (%rsi), %xmm0
-; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
-; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
+; CHECK-SSE1-NEXT: movq (%rcx), %rdx
+; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx
+; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-SSE1-NEXT: shrq $32, %rcx
+; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; CHECK-SSE1-NEXT: shrq $32, %rdx
+; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+;
CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rsi), %xmm1 +; CHECK-SSE1-NEXT: orps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_varx_mone: @@ -85,8 +99,24 @@ ; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0 -; CHECK-SSE1-NEXT: orps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movq (%rcx), %rdx +; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -187,12 +217,12 @@ ; CHECK-SSE1-LABEL: in_constant_varx_42: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 -; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andps (%rcx), %xmm1 +; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_42: @@ -263,10 +293,11 @@ ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 -; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] +; CHECK-SSE1-NEXT: movaps (%rsi), %xmm2 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -299,8 +330,24 @@ ; CHECK-SSE1-LABEL: out_constant_mone_vary: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: orps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movq (%rcx), %rsi 
+; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -329,8 +376,24 @@ ; CHECK-SSE1-LABEL: in_constant_mone_vary: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: orps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andnps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -359,12 +422,26 @@ ; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq (%rcx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rdx), %xmm1 +; CHECK-SSE1-NEXT: orps %xmm0, %xmm1 +; 
CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask: @@ -399,10 +476,27 @@ ; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: andnps %xmm0, %xmm2 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: movaps %xmm2, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask: @@ -469,11 +563,25 @@ ; CHECK-SSE1-LABEL: in_constant_42_vary: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 -; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -544,11 +652,26 @@ ; CHECK-SSE1-LABEL: in_constant_42_vary_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: 
unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm1 -; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm2 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -86,11 +86,10 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %edi -; CHECK-NEXT: notl %eax -; CHECK-NEXT: andl %esi, %eax -; CHECK-NEXT: orl %edi, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: xorl %esi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %mx = and <1 x i16> %x, %mask @@ -235,32 +234,28 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %r8d, %eax +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: andl %r8d, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notl %eax -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: andl %ecx, %r9d -; CHECK-BASELINE-NEXT: orl %esi, %r9d -; CHECK-BASELINE-NEXT: andl %edx, %eax -; CHECK-BASELINE-NEXT: orl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %r8d, %eax +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: andl %r8d, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notl %eax -; CHECK-SSE1-NEXT: notl %r9d -; CHECK-SSE1-NEXT: andl %ecx, %r9d -; CHECK-SSE1-NEXT: orl %esi, %r9d -; CHECK-SSE1-NEXT: andl %edx, %eax -; CHECK-SSE1-NEXT: orl %edi, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i16: @@ -439,49 +434,55 @@ ; CHECK-BASELINE-LABEL: out_v4i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl 
{{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r11d, %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %r11d, %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %ecx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r10d, %ecx -; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: andl %r11d, %r8d +; CHECK-BASELINE-NEXT: andl %r10d, %ecx +; CHECK-BASELINE-NEXT: andl %edi, %edx +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: notl %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: orl %r8d, %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %ecx, %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: orl %edx, %edi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: movw %si, (%rax) -; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) -; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %r11w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 4(%rax) +; CHECK-BASELINE-NEXT: movw %di, 2(%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r11d, %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %r11d, %edx -; CHECK-SSE1-NEXT: xorl %r10d, %ecx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r10d, %ecx -; CHECK-SSE1-NEXT: xorl %edi, %r8d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: andl %r11d, %r8d +; CHECK-SSE1-NEXT: andl %r10d, %ecx +; CHECK-SSE1-NEXT: andl %edi, %edx +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: notl %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: orl %r8d, %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %ecx, %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: orl %edx, %edi ; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: movw %si, (%rax) -; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) -; CHECK-SSE1-NEXT: movw %cx, 4(%rax) -; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %r11w, 6(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 4(%rax) +; CHECK-SSE1-NEXT: movw %di, 2(%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16: @@ -506,43 +507,47 @@ ; CHECK-BASELINE-LABEL: out_v4i16_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl 
{{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: andl %r10d, %r8d +; CHECK-BASELINE-NEXT: andl %edi, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %r8d, %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: orl %edx, %edi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) ; CHECK-BASELINE-NEXT: movw %si, (%rax) -; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %di, 2(%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16_undef: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: andl %r10d, %r8d +; CHECK-SSE1-NEXT: andl %edi, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: xorl %edi, %r8d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %r8d, %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: orl %edx, %edi ; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) ; CHECK-SSE1-NEXT: movw %si, (%rax) -; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) -; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 6(%rax) +; CHECK-SSE1-NEXT: movw %di, 2(%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16_undef: @@ -877,118 +882,118 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v8i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: pushq %rbp -; CHECK-BASELINE-NEXT: pushq %r15 -; CHECK-BASELINE-NEXT: pushq %r14 -; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d 
-; CHECK-BASELINE-NEXT: xorl %r12d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %r12d, %esi -; CHECK-BASELINE-NEXT: xorl %r15d, %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %r15d, %edx -; CHECK-BASELINE-NEXT: xorl %r14d, %ecx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r14d, %ecx -; CHECK-BASELINE-NEXT: xorl %ebp, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %ebp, %r8d -; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w -; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: movl %r11d, %ebx -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: xorl %r11d, %ebx -; CHECK-BASELINE-NEXT: movl %r10d, %r11d -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: andw %r11w, %bx +; CHECK-BASELINE-NEXT: notl %r11d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-BASELINE-NEXT: xorl %r10d, %r11d -; CHECK-BASELINE-NEXT: movl %edi, %r10d -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %ebx, %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %r10w, %bx +; CHECK-BASELINE-NEXT: notl %r10d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: xorl %edi, %r10d -; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) -; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax) -; CHECK-BASELINE-NEXT: movw %bx, 10(%rax) -; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax) -; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) -; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) -; CHECK-BASELINE-NEXT: movw %si, (%rax) +; CHECK-BASELINE-NEXT: orl %ebx, %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %di, %bx +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: orl %ebx, %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andl %ebx, %r9d +; CHECK-BASELINE-NEXT: notl %ebx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-BASELINE-NEXT: orl %r9d, %ebx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r9d +; CHECK-BASELINE-NEXT: andl %r9d, %r8d +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w +; CHECK-BASELINE-NEXT: orl %r8d, %r9d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-BASELINE-NEXT: andl %r8d, %ecx +; CHECK-BASELINE-NEXT: notl %r8d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-BASELINE-NEXT: orl %ecx, %r8d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-BASELINE-NEXT: andl %ecx, %edx +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: orl %edx, %ecx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-BASELINE-NEXT: andl %edx, %esi +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: orl %esi, %edx +; CHECK-BASELINE-NEXT: movw %r11w, 14(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 12(%rax) +; CHECK-BASELINE-NEXT: movw %di, 10(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 8(%rax) +; CHECK-BASELINE-NEXT: movw %r9w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %r8w, 4(%rax) +; CHECK-BASELINE-NEXT: movw %cx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %dx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx -; CHECK-BASELINE-NEXT: popq 
%r12 -; CHECK-BASELINE-NEXT: popq %r14 -; CHECK-BASELINE-NEXT: popq %r15 -; CHECK-BASELINE-NEXT: popq %rbp ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v8i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: pushq %rbp -; CHECK-SSE1-NEXT: pushq %r15 -; CHECK-SSE1-NEXT: pushq %r14 -; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: xorl %r12d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %r12d, %esi -; CHECK-SSE1-NEXT: xorl %r15d, %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %r15d, %edx -; CHECK-SSE1-NEXT: xorl %r14d, %ecx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r14d, %ecx -; CHECK-SSE1-NEXT: xorl %ebp, %r8d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %ebp, %r8d -; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w -; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: movl %r11d, %ebx -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: xorl %r11d, %ebx -; CHECK-SSE1-NEXT: movl %r10d, %r11d -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: andw %r11w, %bx +; CHECK-SSE1-NEXT: notl %r11d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-SSE1-NEXT: xorl %r10d, %r11d -; CHECK-SSE1-NEXT: movl %edi, %r10d -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %ebx, %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %r10w, %bx +; CHECK-SSE1-NEXT: notl %r10d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: xorl %edi, %r10d -; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) -; CHECK-SSE1-NEXT: movw %r11w, 12(%rax) -; CHECK-SSE1-NEXT: movw %bx, 10(%rax) -; CHECK-SSE1-NEXT: movw %r9w, 8(%rax) -; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) -; CHECK-SSE1-NEXT: movw %cx, 4(%rax) -; CHECK-SSE1-NEXT: movw %dx, 2(%rax) -; CHECK-SSE1-NEXT: movw %si, (%rax) +; CHECK-SSE1-NEXT: orl %ebx, %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %di, %bx +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: orl %ebx, %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andl %ebx, %r9d +; CHECK-SSE1-NEXT: notl %ebx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-SSE1-NEXT: orl %r9d, %ebx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r9d +; CHECK-SSE1-NEXT: andl %r9d, %r8d +; CHECK-SSE1-NEXT: notl %r9d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w +; CHECK-SSE1-NEXT: orl %r8d, %r9d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-SSE1-NEXT: andl %r8d, %ecx +; CHECK-SSE1-NEXT: notl %r8d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-SSE1-NEXT: orl %ecx, %r8d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-SSE1-NEXT: andl %ecx, %edx +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw 
{{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: orl %edx, %ecx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-SSE1-NEXT: andl %edx, %esi +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: orl %esi, %edx +; CHECK-SSE1-NEXT: movw %r11w, 14(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 12(%rax) +; CHECK-SSE1-NEXT: movw %di, 10(%rax) +; CHECK-SSE1-NEXT: movw %bx, 8(%rax) +; CHECK-SSE1-NEXT: movw %r9w, 6(%rax) +; CHECK-SSE1-NEXT: movw %r8w, 4(%rax) +; CHECK-SSE1-NEXT: movw %cx, 2(%rax) +; CHECK-SSE1-NEXT: movw %dx, (%rax) ; CHECK-SSE1-NEXT: popq %rbx -; CHECK-SSE1-NEXT: popq %r12 -; CHECK-SSE1-NEXT: popq %r14 -; CHECK-SSE1-NEXT: popq %r15 -; CHECK-SSE1-NEXT: popq %rbp ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v8i16: @@ -1759,113 +1764,135 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw (%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax -; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r12d -; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r10w, %ax -; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r10d -; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r11w, %ax -; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r11d -; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bx, %ax -; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebx -; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bp, %ax -; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebp -; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r14w, %ax -; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r14d -; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r15w, %ax -; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r15d -; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; 
CHECK-BASELINE-NEXT: andw 20(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movzwl 24(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax -; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d -; CHECK-BASELINE-NEXT: xorw %ax, %r10w -; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w -; CHECK-BASELINE-NEXT: xorl %r10d, %eax -; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorw %r10w, %r11w -; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w -; CHECK-BASELINE-NEXT: xorl %r11d, %r10d -; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx -; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi -; CHECK-BASELINE-NEXT: xorw %dx, %si -; CHECK-BASELINE-NEXT: andw 30(%rcx), %si -; CHECK-BASELINE-NEXT: xorl %esi, %edx -; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) -; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi) -; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) -; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) -; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) -; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) -; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) -; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) -; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi) +; CHECK-BASELINE-NEXT: movq %rcx, %r9 +; CHECK-BASELINE-NEXT: movq %rdx, %r10 +; CHECK-BASELINE-NEXT: movq %rsi, %r8 +; CHECK-BASELINE-NEXT: movq %rdi, %r11 +; CHECK-BASELINE-NEXT: movl 12(%rcx), %eax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 14(%rcx), %edx +; CHECK-BASELINE-NEXT: movl 16(%rcx), %esi +; CHECK-BASELINE-NEXT: movzwl 18(%rcx), %edi +; CHECK-BASELINE-NEXT: movl 20(%rcx), %ecx +; CHECK-BASELINE-NEXT: movzwl 22(%r9), %ebx +; CHECK-BASELINE-NEXT: movl 24(%r9), %ebp +; CHECK-BASELINE-NEXT: movzwl 26(%r9), %r14d +; CHECK-BASELINE-NEXT: movl 28(%r9), %r15d +; CHECK-BASELINE-NEXT: movzwl 30(%r9), %r12d +; CHECK-BASELINE-NEXT: movzwl 30(%r8), %r13d +; CHECK-BASELINE-NEXT: andw %r12w, %r13w +; CHECK-BASELINE-NEXT: notl %r12d +; CHECK-BASELINE-NEXT: andw 30(%r10), %r12w +; CHECK-BASELINE-NEXT: orl %r13d, %r12d +; CHECK-BASELINE-NEXT: movzwl 28(%r8), %eax +; CHECK-BASELINE-NEXT: andw %r15w, %ax +; CHECK-BASELINE-NEXT: notl %r15d +; CHECK-BASELINE-NEXT: andw 28(%r10), %r15w +; CHECK-BASELINE-NEXT: orl %eax, %r15d +; CHECK-BASELINE-NEXT: movzwl 26(%r8), %eax +; CHECK-BASELINE-NEXT: andw %r14w, %ax +; CHECK-BASELINE-NEXT: notl %r14d +; CHECK-BASELINE-NEXT: andw 26(%r10), %r14w +; CHECK-BASELINE-NEXT: orl %eax, %r14d +; CHECK-BASELINE-NEXT: movzwl 24(%r8), %eax +; CHECK-BASELINE-NEXT: andw %bp, %ax +; CHECK-BASELINE-NEXT: notl %ebp +; CHECK-BASELINE-NEXT: andw 24(%r10), %bp +; CHECK-BASELINE-NEXT: orl %eax, %ebp +; CHECK-BASELINE-NEXT: movzwl 22(%r8), %eax +; CHECK-BASELINE-NEXT: andw %bx, %ax +; CHECK-BASELINE-NEXT: notl %ebx +; CHECK-BASELINE-NEXT: andw 22(%r10), %bx +; CHECK-BASELINE-NEXT: orl %eax, %ebx +; CHECK-BASELINE-NEXT: movzwl 20(%r8), %eax +; CHECK-BASELINE-NEXT: andw %cx, %ax +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw 20(%r10), %cx +; CHECK-BASELINE-NEXT: orl %eax, %ecx +; CHECK-BASELINE-NEXT: movl %ecx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 18(%r8), %eax +; CHECK-BASELINE-NEXT: andw %di, %ax +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: andw 18(%r10), %di +; CHECK-BASELINE-NEXT: orl %eax, %edi +; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 16(%r8), %eax +; CHECK-BASELINE-NEXT: andw %si, %ax +; CHECK-BASELINE-NEXT: notl %esi +; CHECK-BASELINE-NEXT: andw 16(%r10), %si +; CHECK-BASELINE-NEXT: orl %eax, %esi +; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 14(%r8), %eax +; CHECK-BASELINE-NEXT: andw %dx, %ax +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw 14(%r10), %dx +; CHECK-BASELINE-NEXT: orl %eax, %edx +; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 12(%r8), %eax +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: andw %cx, %ax +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw 12(%r10), %cx +; CHECK-BASELINE-NEXT: orl %eax, %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 10(%r9), %r13d +; CHECK-BASELINE-NEXT: movzwl 10(%r8), %eax +; CHECK-BASELINE-NEXT: andw %r13w, %ax +; CHECK-BASELINE-NEXT: notl %r13d +; CHECK-BASELINE-NEXT: andw 10(%r10), %r13w +; CHECK-BASELINE-NEXT: orl %eax, %r13d +; CHECK-BASELINE-NEXT: movl 8(%r9), %edi +; CHECK-BASELINE-NEXT: movzwl 8(%r8), %eax +; CHECK-BASELINE-NEXT: andw %di, %ax +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: andw 8(%r10), %di +; CHECK-BASELINE-NEXT: orl %eax, %edi +; CHECK-BASELINE-NEXT: movzwl 6(%r9), %esi +; CHECK-BASELINE-NEXT: movzwl 6(%r8), %eax +; CHECK-BASELINE-NEXT: andw %si, %ax +; CHECK-BASELINE-NEXT: notl %esi +; CHECK-BASELINE-NEXT: andw 6(%r10), %si +; CHECK-BASELINE-NEXT: orl %eax, %esi +; CHECK-BASELINE-NEXT: movl 4(%r9), %edx +; CHECK-BASELINE-NEXT: movzwl 4(%r8), %eax +; CHECK-BASELINE-NEXT: andw %dx, %ax +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw 4(%r10), %dx +; CHECK-BASELINE-NEXT: orl %eax, %edx +; CHECK-BASELINE-NEXT: movzwl 2(%r9), %ecx +; CHECK-BASELINE-NEXT: movzwl 2(%r8), %eax +; CHECK-BASELINE-NEXT: andw %cx, %ax +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw 2(%r10), %cx +; CHECK-BASELINE-NEXT: orl %eax, %ecx +; CHECK-BASELINE-NEXT: movl (%r9), %r9d +; CHECK-BASELINE-NEXT: movzwl (%r8), %eax +; CHECK-BASELINE-NEXT: andw %r9w, %ax +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: andw (%r10), %r9w +; CHECK-BASELINE-NEXT: orl %eax, %r9d +; CHECK-BASELINE-NEXT: movw %r12w, 30(%r11) +; CHECK-BASELINE-NEXT: movw %r15w, 28(%r11) +; CHECK-BASELINE-NEXT: movw %r14w, 26(%r11) +; CHECK-BASELINE-NEXT: movw %bp, 24(%r11) +; CHECK-BASELINE-NEXT: movw %bx, 22(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 20(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 18(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 16(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) -; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi) +; CHECK-BASELINE-NEXT: movw 
%ax, 14(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%rdi) -; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movw %ax, 12(%r11) +; CHECK-BASELINE-NEXT: movw %r13w, 10(%r11) +; CHECK-BASELINE-NEXT: movw %di, 8(%r11) +; CHECK-BASELINE-NEXT: movw %si, 6(%r11) +; CHECK-BASELINE-NEXT: movw %dx, 4(%r11) +; CHECK-BASELINE-NEXT: movw %cx, 2(%r11) +; CHECK-BASELINE-NEXT: movw %r9w, (%r11) +; CHECK-BASELINE-NEXT: movq %r11, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1882,113 +1909,135 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d -; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d -; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp -; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d -; CHECK-SSE1-NEXT: movzwl (%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw (%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax -; CHECK-SSE1-NEXT: andw 2(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r12d -; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 4(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r10w, %ax -; CHECK-SSE1-NEXT: andw 6(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r10d -; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r11w, %ax -; CHECK-SSE1-NEXT: andw 8(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r11d -; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 10(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bx, %ax -; CHECK-SSE1-NEXT: andw 12(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebx -; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bp, %ax -; CHECK-SSE1-NEXT: andw 14(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebp -; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r14w, %ax -; CHECK-SSE1-NEXT: andw 16(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r14d -; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r15w, %ax -; CHECK-SSE1-NEXT: andw 18(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r15d -; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 20(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 22(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw 
24(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax -; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d -; CHECK-SSE1-NEXT: xorw %ax, %r10w -; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w -; CHECK-SSE1-NEXT: xorl %r10d, %eax -; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d -; CHECK-SSE1-NEXT: xorw %r10w, %r11w -; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w -; CHECK-SSE1-NEXT: xorl %r11d, %r10d -; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx -; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi -; CHECK-SSE1-NEXT: xorw %dx, %si -; CHECK-SSE1-NEXT: andw 30(%rcx), %si -; CHECK-SSE1-NEXT: xorl %esi, %edx -; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) -; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi) -; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) -; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) -; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) -; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) -; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) -; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) -; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) -; CHECK-SSE1-NEXT: movw %bx, 12(%rdi) +; CHECK-SSE1-NEXT: movq %rcx, %r9 +; CHECK-SSE1-NEXT: movq %rdx, %r10 +; CHECK-SSE1-NEXT: movq %rsi, %r8 +; CHECK-SSE1-NEXT: movq %rdi, %r11 +; CHECK-SSE1-NEXT: movl 12(%rcx), %eax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 14(%rcx), %edx +; CHECK-SSE1-NEXT: movl 16(%rcx), %esi +; CHECK-SSE1-NEXT: movzwl 18(%rcx), %edi +; CHECK-SSE1-NEXT: movl 20(%rcx), %ecx +; CHECK-SSE1-NEXT: movzwl 22(%r9), %ebx +; CHECK-SSE1-NEXT: movl 24(%r9), %ebp +; CHECK-SSE1-NEXT: movzwl 26(%r9), %r14d +; CHECK-SSE1-NEXT: movl 28(%r9), %r15d +; CHECK-SSE1-NEXT: movzwl 30(%r9), %r12d +; CHECK-SSE1-NEXT: movzwl 30(%r8), %r13d +; CHECK-SSE1-NEXT: andw %r12w, %r13w +; CHECK-SSE1-NEXT: notl %r12d +; CHECK-SSE1-NEXT: andw 30(%r10), %r12w +; CHECK-SSE1-NEXT: orl %r13d, %r12d +; CHECK-SSE1-NEXT: movzwl 28(%r8), %eax +; CHECK-SSE1-NEXT: andw %r15w, %ax +; CHECK-SSE1-NEXT: notl %r15d +; CHECK-SSE1-NEXT: andw 28(%r10), %r15w +; CHECK-SSE1-NEXT: orl %eax, %r15d +; CHECK-SSE1-NEXT: movzwl 26(%r8), %eax +; CHECK-SSE1-NEXT: andw %r14w, %ax +; CHECK-SSE1-NEXT: notl %r14d +; CHECK-SSE1-NEXT: andw 26(%r10), %r14w +; CHECK-SSE1-NEXT: orl %eax, %r14d +; CHECK-SSE1-NEXT: movzwl 24(%r8), %eax +; CHECK-SSE1-NEXT: andw %bp, %ax +; CHECK-SSE1-NEXT: notl %ebp +; CHECK-SSE1-NEXT: andw 24(%r10), %bp +; CHECK-SSE1-NEXT: orl %eax, %ebp +; CHECK-SSE1-NEXT: movzwl 22(%r8), %eax +; CHECK-SSE1-NEXT: andw %bx, %ax +; CHECK-SSE1-NEXT: notl %ebx +; CHECK-SSE1-NEXT: andw 22(%r10), %bx +; CHECK-SSE1-NEXT: orl %eax, %ebx +; CHECK-SSE1-NEXT: movzwl 20(%r8), %eax +; CHECK-SSE1-NEXT: andw %cx, %ax +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw 20(%r10), %cx +; CHECK-SSE1-NEXT: orl %eax, %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 18(%r8), %eax +; CHECK-SSE1-NEXT: andw %di, %ax +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: andw 18(%r10), %di +; CHECK-SSE1-NEXT: orl %eax, %edi +; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 16(%r8), %eax +; CHECK-SSE1-NEXT: andw %si, %ax +; CHECK-SSE1-NEXT: notl %esi +; CHECK-SSE1-NEXT: andw 16(%r10), %si +; CHECK-SSE1-NEXT: orl %eax, %esi +; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 14(%r8), %eax +; CHECK-SSE1-NEXT: andw %dx, %ax +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw 14(%r10), %dx +; CHECK-SSE1-NEXT: orl %eax, %edx +; CHECK-SSE1-NEXT: movl %edx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 12(%r8), %eax +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: andw %cx, %ax +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw 12(%r10), %cx +; CHECK-SSE1-NEXT: orl %eax, %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 10(%r9), %r13d +; CHECK-SSE1-NEXT: movzwl 10(%r8), %eax +; CHECK-SSE1-NEXT: andw %r13w, %ax +; CHECK-SSE1-NEXT: notl %r13d +; CHECK-SSE1-NEXT: andw 10(%r10), %r13w +; CHECK-SSE1-NEXT: orl %eax, %r13d +; CHECK-SSE1-NEXT: movl 8(%r9), %edi +; CHECK-SSE1-NEXT: movzwl 8(%r8), %eax +; CHECK-SSE1-NEXT: andw %di, %ax +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: andw 8(%r10), %di +; CHECK-SSE1-NEXT: orl %eax, %edi +; CHECK-SSE1-NEXT: movzwl 6(%r9), %esi +; CHECK-SSE1-NEXT: movzwl 6(%r8), %eax +; CHECK-SSE1-NEXT: andw %si, %ax +; CHECK-SSE1-NEXT: notl %esi +; CHECK-SSE1-NEXT: andw 6(%r10), %si +; CHECK-SSE1-NEXT: orl %eax, %esi +; CHECK-SSE1-NEXT: movl 4(%r9), %edx +; CHECK-SSE1-NEXT: movzwl 4(%r8), %eax +; CHECK-SSE1-NEXT: andw %dx, %ax +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw 4(%r10), %dx +; CHECK-SSE1-NEXT: orl %eax, %edx +; CHECK-SSE1-NEXT: movzwl 2(%r9), %ecx +; CHECK-SSE1-NEXT: movzwl 2(%r8), %eax +; CHECK-SSE1-NEXT: andw %cx, %ax +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw 2(%r10), %cx +; CHECK-SSE1-NEXT: orl %eax, %ecx +; CHECK-SSE1-NEXT: movl (%r9), %r9d +; CHECK-SSE1-NEXT: movzwl (%r8), %eax +; CHECK-SSE1-NEXT: andw %r9w, %ax +; CHECK-SSE1-NEXT: notl %r9d +; CHECK-SSE1-NEXT: andw (%r10), %r9w +; CHECK-SSE1-NEXT: orl %eax, %r9d +; CHECK-SSE1-NEXT: movw %r12w, 30(%r11) +; CHECK-SSE1-NEXT: movw %r15w, 28(%r11) +; CHECK-SSE1-NEXT: movw %r14w, 26(%r11) +; CHECK-SSE1-NEXT: movw %bp, 24(%r11) +; CHECK-SSE1-NEXT: movw %bx, 22(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 20(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 18(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 16(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) -; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 14(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%rdi) -; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movw %ax, 12(%r11) +; CHECK-SSE1-NEXT: movw %r13w, 10(%r11) +; CHECK-SSE1-NEXT: movw %di, 8(%r11) +; CHECK-SSE1-NEXT: movw %si, 6(%r11) +; CHECK-SSE1-NEXT: movw %dx, 4(%r11) +; CHECK-SSE1-NEXT: movw %cx, 2(%r11) +; CHECK-SSE1-NEXT: movw %r9w, (%r11) +; CHECK-SSE1-NEXT: movq %r11, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -3144,12 +3193,26 @@ ; CHECK-SSE1-LABEL: in_v4i32: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 -; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq (%rdx), %rdi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; 
CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdi +; CHECK-SSE1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_v4i32: diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -62,22 +62,33 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; X86-LABEL: test_urem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax -; X86-NEXT: andb $15, %al -; X86-NEXT: cmpb $4, %al -; X86-NEXT: setae %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andb $15, %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %edx +; X86-NEXT: leal (%ecx,%edx,4), %ecx +; X86-NEXT: shrb $6, %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %ecx +; X86-NEXT: subb %cl, %al +; X86-NEXT: testb $15, %al +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $15, %al -; X64-NEXT: cmpb $4, %al -; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal (%rax,%rax,2), %ecx +; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: shrb $6, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: subb %al, %dil +; X64-NEXT: testb $15, %dil +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -5,18 +5,27 @@ define i1 @t32_3_1(i32 %X) nounwind { ; X86-LABEL: t32_3_1: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_3_1: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: 
movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $33, %rcx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 3 %cmp = icmp eq i32 %urem, 1 @@ -26,18 +35,27 @@ define i1 @t32_3_2(i32 %X) nounwind { ; X86-LABEL: t32_3_2: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_3_2: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $33, %rcx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 3 %cmp = icmp eq i32 %urem, 2 @@ -48,18 +66,27 @@ define i1 @t32_5_1(i32 %X) nounwind { ; X86-LABEL: t32_5_1: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_1: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 1 @@ -69,18 +96,27 @@ define i1 @t32_5_2(i32 %X) nounwind { ; X86-LABEL: t32_5_2: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $1717986918, %eax # imm = 0x66666666 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_2: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $1717986918, %eax # imm = 0x66666666 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; 
X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 2 @@ -90,18 +126,27 @@ define i1 @t32_5_3(i32 %X) nounwind { ; X86-LABEL: t32_5_3: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $-1717986919, %eax # imm = 0x99999999 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $3, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_3: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $-1717986919, %eax # imm = 0x99999999 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $3, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 3 @@ -111,18 +156,27 @@ define i1 @t32_5_4(i32 %X) nounwind { ; X86-LABEL: t32_5_4: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $4, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_4: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $4, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 4 @@ -133,20 +187,29 @@ define i1 @t32_6_1(i32 %X) nounwind { ; X86-LABEL: t32_6_1: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_1: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl 
$2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 1 @@ -156,20 +219,29 @@ define i1 @t32_6_2(i32 %X) nounwind { ; X86-LABEL: t32_6_2: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_2: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 2 @@ -179,20 +251,29 @@ define i1 @t32_6_3(i32 %X) nounwind { ; X86-LABEL: t32_6_3: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: decl %eax -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $3, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_3: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: decl %eax -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $3, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 3 @@ -202,20 +283,29 @@ define i1 @t32_6_4(i32 %X) nounwind { ; X86-LABEL: t32_6_4: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $1431655764, %eax # imm = 0x55555554 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $4, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_4: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB 
-; X64-NEXT: addl $1431655764, %eax # imm = 0x55555554 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $4, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 4 @@ -225,20 +315,29 @@ define i1 @t32_6_5(i32 %X) nounwind { ; X86-LABEL: t32_6_5: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $5, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_5: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $5, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 5 @@ -251,20 +350,24 @@ define i1 @t16_3_2(i16 %X) nounwind { ; X86-LABEL: t16_3_2: ; X86: # %bb.0: -; X86-NEXT: imull $-21845, {{[0-9]+}}(%esp), %eax # imm = 0xAAAB -; X86-NEXT: addl $-21846, %eax # imm = 0xAAAA -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: cmpl $21845, %eax # imm = 0x5555 -; X86-NEXT: setb %al +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $43691, %eax, %ecx # imm = 0xAAAB +; X86-NEXT: shrl $17, %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmpw $2, %ax +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t16_3_2: ; X64: # %bb.0: -; X64-NEXT: imull $-21845, %edi, %eax # imm = 0xAAAB -; X64-NEXT: addl $-21846, %eax # imm = 0xAAAA -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: cmpl $21845, %eax # imm = 0x5555 -; X64-NEXT: setb %al +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB +; X64-NEXT: shrl $17, %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpw $2, %di +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -274,18 +377,24 @@ define i1 @t8_3_2(i8 %X) nounwind { ; X86-LABEL: t8_3_2: ; X86: # %bb.0: -; X86-NEXT: imull $-85, {{[0-9]+}}(%esp), %eax -; X86-NEXT: addb $-86, %al -; X86-NEXT: cmpb $85, %al -; X86-NEXT: setb %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $171, %eax, %ecx +; X86-NEXT: shrl $9, %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: subb %cl, %al +; X86-NEXT: cmpb $2, %al +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t8_3_2: ; X64: # %bb.0: -; X64-NEXT: imull $-85, %edi, %eax -; X64-NEXT: addb $-86, %al -; X64-NEXT: cmpb $85, %al -; X64-NEXT: setb %al +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $171, %eax, 
%ecx +; X64-NEXT: shrl $9, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %ecx +; X64-NEXT: subb %cl, %al +; X64-NEXT: cmpb $2, %al +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i8 %X, 3 %cmp = icmp eq i8 %urem, 2 @@ -312,13 +421,14 @@ ; ; X64-LABEL: t64_3_2: ; X64: # %bb.0: -; X64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 -; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: setb %al +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rdx,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: cmpq $2, %rdi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i64 %X, 3 %cmp = icmp eq i64 %urem, 2 diff --git a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll --- a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll @@ -47,10 +47,15 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; X86-LABEL: test_optsize: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: cmpl $858993460, %eax # imm = 0x33333334 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl $42, %eax -; X86-NEXT: jb .LBB1_2 +; X86-NEXT: je .LBB1_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl $-10, %eax ; X86-NEXT: .LBB1_2: @@ -58,11 +63,15 @@ ; ; X64-LABEL: test_optsize: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: cmpl $858993460, %eax # imm = 0x33333334 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: cmpl %eax, %edi ; X64-NEXT: movl $42, %ecx ; X64-NEXT: movl $-10, %eax -; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %ecx, %eax ; X64-NEXT: retq %rem = urem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -9,68 +9,108 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[3435973837,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $3, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,171798691,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 
= xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, 
%xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -86,33 +126,95 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_allones_eq: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -121,33 +223,96 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,858993460,2,858993460] -; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_allones_ne: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; 
CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -158,73 +323,101 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 
-; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, 
%xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -236,75 +429,104 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783379,306783379,2,306783379] -; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: 
vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, 
%xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -316,68 +538,108 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; 
CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -389,70 +651,111 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: 
test_urem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,306783379,2,42949673] -; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -466,62 +769,89 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,268435455,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; 
CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -535,73 +865,98 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = 
[2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -615,72 +970,104 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[3435973837,2454267027,268435456,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,268435455,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; 
CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -696,48 +1083,98 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-SSE2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_one: ; CHECK-AVX1: # %bb.0: -; 
CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -751,63 +1188,105 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; 
CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -821,68 +1300,103 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2454267027,u,1374389535> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2454267027,u,1374389535> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; 
CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -898,62 +1412,89 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,2,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -967,73 +1508,98 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1047,72 +1613,104 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; 
CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; 
CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1128,62 +1726,95 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1197,72 +1828,104 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1276,72 +1939,96 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: 
vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1357,33 +2044,96 @@ define <4 x i32> 
@test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2147483649,u,3435973837> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2147483649,u,3435973837> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_allones_and_one: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -1394,72 +2144,102 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,2147483649,u,2454267027> +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1474,67 +2254,97 @@ ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, 
%xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,2147483649,u,1374389535> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,2147483649,u,1374389535> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1550,68 +2360,90 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,268435456,u,3435973837> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,268435456,u,3435973837> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1625,72 +2457,99 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { 
; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,268435456,u,2454267027> +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 
; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1705,67 +2564,94 @@ ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,268435456,u,1374389535> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,268435456,u,1374389535> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw 
{{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 
= xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1780,62 +2666,98 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; 
CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX512VL-NEXT: 
vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1848,62 +2770,107 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; 
CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: 
test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1912,3 +2879,5 @@ %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -8,52 +8,81 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_3: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_3: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655764,1431655764,1431655764] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; 
CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_3: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_3: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_3: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, 
%xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -63,53 +92,81 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_5: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_5: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pslld $2, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_5: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpslld $2, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; 
CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_5: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpslld $2, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_5: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpslld $2, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -119,68 +176,81 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part0: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: movdqa 
%xmm3, %xmm0 -; CHECK-SSE2-NEXT: psrld $1, %xmm0 -; CHECK-SSE2-NEXT: pslld $31, %xmm3 -; CHECK-SSE2-NEXT: por %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part0: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part0: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part0: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part0: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -190,67 +260,81 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part1: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 -; CHECK-SSE2-NEXT: psrld $1, %xmm0 -; CHECK-SSE2-NEXT: pslld $31, %xmm3 -; CHECK-SSE2-NEXT: por %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; 
CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part1: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827881,715827881,715827882,715827882] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part1: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; 
CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part1: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -260,59 +344,79 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_tautological: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_tautological: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 
+; CHECK-SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_tautological: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_tautological: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_tautological: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq 
%xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -9,49 +9,89 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_25: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_25: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -65,64 +105,89 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: 
vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -138,33 +203,95 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_neg25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2147483661,2147483661,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_neg25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,1,1,171798691] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2147483661,2147483661,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_neg25: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_neg25: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_neg25: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; 
CHECK-AVX512VL-LABEL: test_urem_odd_neg25: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -176,60 +303,92 @@ ; CHECK-SSE2-LABEL: test_urem_even_neg100: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: psrld $5, %xmm1 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: psrld $27, %xmm2 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,42949672,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psrld $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $27, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: 
pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $27, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_neg100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -550,50 +709,42 @@ ; We could lower remainder of division by all-ones much better elsewhere. define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_allones: -; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: psubd %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq -; -; CHECK-SSE41-LABEL: test_urem_allones: -; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 -; CHECK-SSE41-NEXT: retq +; CHECK-SSE-LABEL: test_urem_allones: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-SSE-NEXT: pandn %xmm0, %xmm1 +; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_allones: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_allones: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_allones: ; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 +; CHECK-AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -25,54 +25,89 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t1_all_odd_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd 
{{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[2,3] +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t1_all_odd_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t1_all_odd_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t1_all_odd_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t1_all_odd_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -82,60 +117,98 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t1_all_odd_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[2,3] +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t1_all_odd_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t1_all_odd_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t1_all_odd_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t1_all_odd_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -145,48 +218,71 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-SSE2-LABEL: t2_narrow: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <43691,u,u,58255,43691,u,u,58255> +; CHECK-SSE2-NEXT: pmulhuw %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: psrlw $3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrlw $1, %xmm3 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pandn %xmm3, %xmm1 +; CHECK-SSE2-NEXT: por %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: psubw %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t2_narrow: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [21845,65535,65535,65535,21845,65535,65535,65535] -; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <43691,u,u,58255,43691,u,u,58255> +; CHECK-SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrlw $3, %xmm2 +; CHECK-SSE41-NEXT: psrlw $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubw %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t2_narrow: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t2_narrow: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsrlw $3, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t2_narrow: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; 
CHECK-AVX512VL-NEXT: vpsrlw $3, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <8 x i16> %X, %cmp = icmp eq <8 x i16> %urem, @@ -207,16 +303,18 @@ ; CHECK-SSE2-NEXT: psllq $32, %xmm0 ; CHECK-SSE2-NEXT: paddq %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15372286730238776661,9223372034707292159] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1 -; CHECK-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; CHECK-SSE2-NEXT: pand %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: por %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t3_wide: @@ -232,16 +330,18 @@ ; CHECK-SSE41-NEXT: psllq $32, %xmm0 ; CHECK-SSE41-NEXT: paddq %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pand %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15372286730238776661,9223372034707292159] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; CHECK-SSE41-NEXT: pand %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: por %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t3_wide: @@ -289,9 +389,12 @@ ; CHECK-AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; CHECK-AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-AVX512VL-NEXT: vpcmpleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; CHECK-AVX512VL-NEXT: movb $2, %al +; 
CHECK-AVX512VL-NEXT: kmovw %eax, %k1 +; CHECK-AVX512VL-NEXT: kxorw %k1, %k0, %k1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512VL-NEXT: retq %urem = urem <2 x i64> %X, %cmp = icmp eq <2 x i64> %urem, diff --git a/llvm/test/CodeGen/X86/urem-seteq.ll b/llvm/test/CodeGen/X86/urem-seteq.ll --- a/llvm/test/CodeGen/X86/urem-seteq.ll +++ b/llvm/test/CodeGen/X86/urem-seteq.ll @@ -9,18 +9,27 @@ define i32 @test_urem_odd(i32 %X) nounwind { ; X86-LABEL: test_urem_odd: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 0 @@ -31,18 +40,28 @@ define i32 @test_urem_odd_25(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_25: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $171798692, %ecx # imm = 0xA3D70A4 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_25: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq $1374389535, %rax, %rax # imm = 0x51EB851F +; X64-NEXT: shrq $35, %rax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $171798692, %ecx # imm = 0xA3D70A4 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 25 %cmp = icmp eq i32 %urem, 0 @@ -54,18 +73,27 @@ define i32 @test_urem_odd_bit30(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-11, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $30, %edx +; X86-NEXT: imull $1073741827, %edx, %edx # imm = 0x40000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $1789569707, %edi, %ecx # imm = 0x6AAAAAAB +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $4294967285, %ecx # imm = 0xFFFFFFF5 +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: imull $1073741827, %ecx, %ecx # imm = 0x40000003 ; X64-NEXT: xorl 
%eax, %eax -; X64-NEXT: cmpl $4, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 1073741827 %cmp = icmp eq i32 %urem, 0 @@ -77,18 +105,28 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $715827883, {{[0-9]+}}(%esp), %ecx # imm = 0x2AAAAAAB +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1073741823, %edx # imm = 0x3FFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $29, %edx +; X86-NEXT: imull $-2147483645, %edx, %edx # imm = 0x80000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $715827883, %edi, %ecx # imm = 0x2AAAAAAB +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shlq $30, %rcx +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: shrq $61, %rcx +; X64-NEXT: imull $-2147483645, %ecx, %ecx # imm = 0x80000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 2147483651 %cmp = icmp eq i32 %urem, 0 @@ -103,23 +141,33 @@ define i16 @test_urem_even(i16 %X) nounwind { ; X86-LABEL: test_urem_even: ; X86: # %bb.0: -; X86-NEXT: imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7 -; X86-NEXT: rorw %ax -; X86-NEXT: movzwl %ax, %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: imull $18725, %eax, %edx # imm = 0x4925 +; X86-NEXT: shrl $17, %edx +; X86-NEXT: leal (%edx,%edx), %eax +; X86-NEXT: shll $4, %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4682, %ecx # imm = 0x124A -; X86-NEXT: setae %al +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even: ; X64: # %bb.0: -; X64-NEXT: imull $28087, %edi, %eax # imm = 0x6DB7 -; X64-NEXT: rorw %ax -; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: imull $18725, %eax, %edx # imm = 0x4925 +; X64-NEXT: shrl $17, %edx +; X64-NEXT: leal (%rdx,%rdx), %eax +; X64-NEXT: shll $4, %edx +; X64-NEXT: subl %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4682, %ecx # imm = 0x124A -; X64-NEXT: setae %al +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: setne %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %urem = urem i16 %X, 14 @@ -131,20 +179,26 @@ define i32 @test_urem_even_100(i32 %X) nounwind { ; X86-LABEL: test_urem_even_100: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: rorl $2, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $5, %edx +; X86-NEXT: imull $100, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even_100: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: rorl $2, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq $1374389535, %rax, %rax # imm = 0x51EB851F +; X64-NEXT: shrq $37, %rax +; X64-NEXT: imull $100, %eax, %ecx ; 
X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 100 %cmp = icmp eq i32 %urem, 0 @@ -156,20 +210,27 @@ define i32 @test_urem_even_bit30(i32 %X) nounwind { ; X86-LABEL: test_urem_even_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5 -; X86-NEXT: rorl $3, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-415, %edx # imm = 0xFE61 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $30, %edx +; X86-NEXT: imull $1073741928, %edx, %edx # imm = 0x40000068 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $-51622203, %edi, %ecx # imm = 0xFCEC4EC5 -; X64-NEXT: rorl $3, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $4294966881, %ecx # imm = 0xFFFFFE61 +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: imull $1073741928, %ecx, %ecx # imm = 0x40000068 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 1073741928 %cmp = icmp eq i32 %urem, 0 @@ -181,20 +242,26 @@ define i32 @test_urem_even_bit31(i32 %X) nounwind { ; X86-LABEL: test_urem_even_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $-1157956869, {{[0-9]+}}(%esp), %ecx # imm = 0xBAFAFAFB -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2147483547, %edx # imm = 0x7FFFFF9B +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $30, %edx +; X86-NEXT: imull $-2147483546, %edx, %edx # imm = 0x80000066 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $-1157956869, %edi, %ecx # imm = 0xBAFAFAFB -; X64-NEXT: rorl %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq $2147483547, %rax, %rax # imm = 0x7FFFFF9B +; X64-NEXT: shrq $62, %rax +; X64-NEXT: imull $-2147483546, %eax, %ecx # imm = 0x80000066 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 2147483750 %cmp = icmp eq i32 %urem, 0 @@ -210,18 +277,27 @@ define i32 @test_urem_odd_setne(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X64-NEXT: setae %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: setne %al ; X64-NEXT: retq 
%urem = urem i32 %X, 5 %cmp = icmp ne i32 %urem, 0 @@ -233,18 +309,27 @@ define i32 @test_urem_negative_odd(i32 %X) nounwind { ; X86-LABEL: test_urem_negative_odd: ; X86: # %bb.0: -; X86-NEXT: imull $858993459, {{[0-9]+}}(%esp), %ecx # imm = 0x33333333 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2147483645, %edx # imm = 0x80000003 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $31, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setae %al +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_negative_odd: ; X64: # %bb.0: -; X64-NEXT: imull $858993459, %edi, %ecx # imm = 0x33333333 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2147483651, %ecx # imm = 0x80000003 +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $63, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setae %al +; X64-NEXT: addl %edi, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i32 %X, -5 %cmp = icmp ne i32 %urem, 0 @@ -254,20 +339,30 @@ define i32 @test_urem_negative_even(i32 %X) nounwind { ; X86-LABEL: test_urem_negative_even: ; X86: # %bb.0: -; X86-NEXT: imull $-920350135, {{[0-9]+}}(%esp), %ecx # imm = 0xC9249249 -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: movl $268435457, %edx # imm = 0x10000001 +; X86-NEXT: mull %edx +; X86-NEXT: shrl $27, %edx +; X86-NEXT: imull $-14, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_negative_even: ; X64: # %bb.0: -; X64-NEXT: imull $-920350135, %edi, %ecx # imm = 0xC9249249 -; X64-NEXT: rorl %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shlq $28, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: shrq $59, %rcx +; X64-NEXT: imull $-14, %ecx, %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setae %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i32 %X, -14 %cmp = icmp ne i32 %urem, 0 @@ -337,19 +432,26 @@ define i32 @test_urem_allones(i32 %X) nounwind { ; X86-LABEL: test_urem_allones: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: je .LBB14_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: .LBB14_2: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_allones: ; X64: # %bb.0: -; X64-NEXT: negl %edi +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: cmpl $-1, %edi +; X64-NEXT: cmovnel %edi, %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %edi -; X64-NEXT: setb %al +; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 @@ -362,7 +464,16 @@ ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=34366 define void @ossfuzz34366() { ; X86-LABEL: ossfuzz34366: +; X86: # %bb.0: +; X86-NEXT: cmpl $0, (%eax) +; X86-NEXT: sete (%eax) +; X86-NEXT: retl +; ; X64-LABEL: ossfuzz34366: +; X64: # %bb.0: +; X64-NEXT: cmpq $0, (%rax) +; X64-NEXT: sete (%rax) +; X64-NEXT: retq %L10 = load i448, ptr undef, align 4 %B18 = urem i448 %L10, 
-363419362147803445274661903944002267176820680343659030140745099590319644056698961663095525356881782780381260803133088966767300814307328 %C13 = icmp ule i448 %B18, 0 diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -111,7 +111,7 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -24,6 +24,7 @@ ; X64-NEXT: pcmpeqd %xmm5, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; X64-NEXT: pand %xmm1, %xmm0 +; X64-NEXT: andpd %xmm0, %xmm4 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm4, %xmm0 @@ -148,6 +149,7 @@ ; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] ; X64-NEXT: pcmpeqd %xmm5, %xmm0 +; X64-NEXT: pand %xmm0, %xmm6 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm6, %xmm0 @@ -271,6 +273,7 @@ ; X64-NEXT: pand %xmm1, %xmm3 ; X64-NEXT: por %xmm4, %xmm3 ; X64-NEXT: pcmpeqw %xmm3, %xmm0 +; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm2, %xmm0 @@ -447,6 +450,7 @@ ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: por %xmm4, %xmm1 ; X64-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -1100,14 +1100,14 @@ ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; SSE-NEXT: psubusw %xmm4, %xmm1 ; SSE-NEXT: psubusw %xmm4, %xmm0 -; SSE-NEXT: psubusw %xmm4, %xmm3 +; SSE-NEXT: psubusw %xmm4, %xmm1 ; SSE-NEXT: psubusw %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: psubusw %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, 48(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) ; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: PR48223: @@ -1117,14 +1117,14 @@ ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; AVX1-NEXT: vpsubusw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubusw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubusw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm2, 32(%rdi) +; AVX1-NEXT: vpsubusw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubusw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubusw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR48223: @@ -1132,10 +1132,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 
-; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1144,10 +1144,17 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512F-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472] +; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpandq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -180,21 +180,25 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) { ; X86-LABEL: and_mask_constant: ; X86: ## %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: and_mask_constant: ; X64: ## %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq ; @@ -202,14 +206,26 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7] +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: and_mask_constant: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7] +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: and_mask_constant: diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -226,69 +226,77 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-LABEL: var_shuffle_v16i8: ; SSE3: # %bb.0: +; SSE3-NEXT: pushq %rbp +; SSE3-NEXT: pushq %r15 +; SSE3-NEXT: pushq %r14 +; SSE3-NEXT: pushq %r13 +; SSE3-NEXT: pushq %r12 +; SSE3-NEXT: pushq %rbx ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: andl $15, %ebp +; SSE3-NEXT: movzbl -24(%rsp,%rbp), %eax ; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r13d +; SSE3-NEXT: movzbl -24(%rsp,%r13), %eax ; SSE3-NEXT: movd %eax, %xmm4 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r12d +; SSE3-NEXT: movzbl -24(%rsp,%r12), %eax ; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r15d +; SSE3-NEXT: movzbl -24(%rsp,%r15), %eax ; SSE3-NEXT: movd %eax, %xmm6 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r14d +; SSE3-NEXT: movzbl -24(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; 
SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ebx +; SSE3-NEXT: movzbl -24(%rsp,%rbx), %eax ; SSE3-NEXT: movd %eax, %xmm8 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r11d +; SSE3-NEXT: movzbl -24(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm5 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r10d +; SSE3-NEXT: movzbl -24(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm9 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r9d +; SSE3-NEXT: movzbl -24(%rsp,%r9), %eax ; SSE3-NEXT: movd %eax, %xmm10 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r8d +; SSE3-NEXT: movzbl -24(%rsp,%r8), %eax ; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edi +; SSE3-NEXT: movzbl -24(%rsp,%rdi), %eax ; SSE3-NEXT: movd %eax, %xmm11 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %esi +; SSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edx +; SSE3-NEXT: movzbl -24(%rsp,%rdx), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ecx +; SSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE3-NEXT: movd %eax, %xmm15 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 @@ -307,6 +315,12 @@ ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: popq %r12 +; SSE3-NEXT: popq %r13 +; SSE3-NEXT: popq %r14 +; SSE3-NEXT: popq %r15 +; SSE3-NEXT: popq %rbp ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8: @@ -490,69 +504,77 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind { ; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: ; SSE3: # %bb.0: +; SSE3-NEXT: pushq %rbp +; SSE3-NEXT: pushq %r15 +; SSE3-NEXT: pushq %r14 +; SSE3-NEXT: pushq %r13 +; SSE3-NEXT: pushq %r12 +; SSE3-NEXT: pushq %rbx ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE3-NEXT: movzbl 
-{{[0-9]+}}(%rsp), %r11d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: andl $15, %ebp +; SSE3-NEXT: movzbl -24(%rsp,%rbp), %eax ; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r13d +; SSE3-NEXT: movzbl -24(%rsp,%r13), %eax ; SSE3-NEXT: movd %eax, %xmm4 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r12d +; SSE3-NEXT: movzbl -24(%rsp,%r12), %eax ; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r15d +; SSE3-NEXT: movzbl -24(%rsp,%r15), %eax ; SSE3-NEXT: movd %eax, %xmm6 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r14d +; SSE3-NEXT: movzbl -24(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ebx +; SSE3-NEXT: movzbl -24(%rsp,%rbx), %eax ; SSE3-NEXT: movd %eax, %xmm8 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r11d +; SSE3-NEXT: movzbl -24(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm5 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r10d +; SSE3-NEXT: movzbl -24(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm9 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r9d +; SSE3-NEXT: movzbl -24(%rsp,%r9), %eax ; SSE3-NEXT: movd %eax, %xmm10 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r8d +; SSE3-NEXT: movzbl -24(%rsp,%r8), %eax ; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edi +; SSE3-NEXT: movzbl -24(%rsp,%rdi), %eax ; SSE3-NEXT: movd %eax, %xmm11 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %esi +; SSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edx +; SSE3-NEXT: movzbl -24(%rsp,%rdx), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ecx +; SSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE3-NEXT: movd %eax, %xmm15 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $15, 
%eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 @@ -571,6 +593,12 @@ ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: popq %r12 +; SSE3-NEXT: popq %r13 +; SSE3-NEXT: popq %r14 +; SSE3-NEXT: popq %r15 +; SSE3-NEXT: popq %rbp ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: @@ -1156,24 +1184,30 @@ ; ; XOP-LABEL: indices_convert: ; XOP: # %bb.0: # %bb -; XOP-NEXT: vmovdqa (%rax), %xmm0 -; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vpermil2pd $0, %xmm1, %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpermil2pd $0, %xmm0, (%rax), %xmm1, %xmm0 ; XOP-NEXT: vmovupd %xmm0, (%rax) +; XOP-NEXT: vzeroupper ; XOP-NEXT: retq ; ; AVX1-LABEL: indices_convert: ; AVX1: # %bb.0: # %bb -; AVX1-NEXT: vmovdqa (%rax), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovapd (%rax), %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX1-NEXT: vpermilpd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vmovupd %xmm0, (%rax) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: indices_convert: diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -170,12 +170,10 @@ ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax @@ -422,12 +420,10 @@ ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $14, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl 
(%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $15, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -471,12 +467,10 @@ ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -521,12 +515,10 @@ ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $11, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $12, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -647,12 +639,10 @@ ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -696,12 +686,10 @@ ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -746,12 +734,10 @@ ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -1102,8 +1088,8 @@ ; AVX512F-NEXT: vpextrd $1, %xmm5, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), 
%xmm0, %xmm0 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: andl $63, %esi ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 @@ -1123,18 +1109,18 @@ ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 ; AVX512F-NEXT: vmovd %xmm5, %eax @@ -1147,7 +1133,7 @@ ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm5 +; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 ; AVX512F-NEXT: vmovd %xmm5, %eax @@ -1160,24 +1146,30 @@ ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm1 +; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm1 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm3 ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm3, %xmm3 +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vmovd %xmm6, %eax @@ -1188,114 +1180,100 @@ ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $2, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 +; AVX512F-NEXT: vpinsrb $6, 
(%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $3, %xmm6, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm6 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vmovd %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm6, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vmovd %eax, %xmm6 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vmovd %xmm8, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $1, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $1, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $2, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm8, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm8 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm7, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm7 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vmovd %xmm8, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $1, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $1, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, 
%xmm7 -; AVX512F-NEXT: vpextrd $2, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm7, %eax ; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm6, %xmm2 +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm4 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm3 ; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 @@ -1353,8 +1331,8 @@ ; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, 
%zmm3 +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: andl $63, %esi ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 @@ -1374,18 +1352,18 @@ ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 ; AVX512BW-NEXT: vmovd %xmm5, %eax @@ -1398,7 +1376,7 @@ ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm5 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 ; AVX512BW-NEXT: vmovd %xmm5, %eax @@ -1411,24 +1389,30 @@ ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm1 +; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm1 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm3 ; AVX512BW-NEXT: vmovd %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vmovd %xmm3, %eax +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm3, %xmm3 +; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm3, %xmm3 +; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm3, %xmm3 +; AVX512BW-NEXT: vmovd %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vmovd %xmm6, %eax @@ -1439,114 +1423,100 @@ ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrd $2, %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax 
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm6 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 -; AVX512BW-NEXT: vmovd %xmm5, %eax +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vmovd %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 -; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vmovd %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm7 +; AVX512BW-NEXT: vmovd %eax, %xmm6 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vmovd %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vmovd %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm8 +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm7, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm7 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vmovd %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vmovd %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, 
(%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm7, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vmovd %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512BW-NEXT: vmovd %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm6, %xmm2 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm4 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 ; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd %xmm4, %zmm3 ; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll --- a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll +++ 
b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll @@ -9,11 +9,11 @@ ; CHECK-LABEL: check_flag: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB0_4 ; CHECK-NEXT: ## %bb.3: ## %entry diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -808,8 +808,7 @@ ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-64-NEXT: vextractps $2, %xmm0, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: movl %eax, %eax +; AVX1-64-NEXT: vmovd %xmm0, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -851,8 +850,7 @@ ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-64-NEXT: vextractps $2, %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: movl %eax, %eax +; AVX2-64-NEXT: vmovd %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1084,28 +1082,31 @@ ; ; AVX1-64-LABEL: uitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-64-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-64-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX1-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-64-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX1-64-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-64-NEXT: vpsrlq $1, %xmm3, %xmm4 +; AVX1-64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-64-NEXT: vorpd %ymm4, %ymm2, %ymm2 +; AVX1-64-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm1 ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] ; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-64-NEXT: vblendvps 
%xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll --- a/llvm/test/CodeGen/X86/vec_fneg.ll +++ b/llvm/test/CodeGen/X86/vec_fneg.ll @@ -118,30 +118,15 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind { ; X32-SSE1-LABEL: fneg_bitcast: ; X32-SSE1: # %bb.0: -; X32-SSE1-NEXT: pushl %ebp -; X32-SSE1-NEXT: movl %esp, %ebp -; X32-SSE1-NEXT: andl $-16, %esp -; X32-SSE1-NEXT: subl $16, %esp -; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000 -; X32-SSE1-NEXT: movl 12(%ebp), %ecx -; X32-SSE1-NEXT: xorl %eax, %ecx -; X32-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X32-SSE1-NEXT: xorl 8(%ebp), %eax -; X32-SSE1-NEXT: movl %eax, (%esp) -; X32-SSE1-NEXT: movaps (%esp), %xmm0 -; X32-SSE1-NEXT: movl %ebp, %esp -; X32-SSE1-NEXT: popl %ebp +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X32-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X32-SSE1-NEXT: retl ; ; X32-SSE2-LABEL: fneg_bitcast: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl $-2147483648, %eax # imm = 0x80000000 -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: xorl %eax, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X32-SSE2-NEXT: retl ; ; X64-SSE1-LABEL: fneg_bitcast: diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -186,14 +186,14 @@ ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] -; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm0 # encoding: [0x0f,0x5a,0x41,0x08] -; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm1 # encoding: [0x0f,0x5a,0x09] -; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x18] -; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x10] -; X32-SSE-NEXT: movups %xmm3, 32(%eax) # encoding: [0x0f,0x11,0x58,0x20] -; X32-SSE-NEXT: movups %xmm2, 48(%eax) # encoding: [0x0f,0x11,0x50,0x30] -; X32-SSE-NEXT: movups %xmm1, (%eax) # encoding: [0x0f,0x11,0x08] -; X32-SSE-NEXT: movups %xmm0, 16(%eax) # encoding: [0x0f,0x11,0x40,0x10] +; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01] +; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1 # encoding: [0x0f,0x5a,0x49,0x08] +; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x10] +; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x18] +; X32-SSE-NEXT: movups %xmm3, 48(%eax) # encoding: [0x0f,0x11,0x58,0x30] +; X32-SSE-NEXT: movups %xmm2, 32(%eax) # encoding: [0x0f,0x11,0x50,0x20] +; X32-SSE-NEXT: movups %xmm1, 16(%eax) # encoding: [0x0f,0x11,0x48,0x10] +; X32-SSE-NEXT: movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00] ; X32-SSE-NEXT: retl # encoding: [0xc3] ; ; X32-AVX-LABEL: fpext_frommem8: @@ -218,14 +218,14 @@ ; ; X64-SSE-LABEL: fpext_frommem8: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm0 # encoding: 
[0x0f,0x5a,0x47,0x08] -; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm1 # encoding: [0x0f,0x5a,0x0f] -; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x18] -; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x10] -; X64-SSE-NEXT: movups %xmm3, 32(%rsi) # encoding: [0x0f,0x11,0x5e,0x20] -; X64-SSE-NEXT: movups %xmm2, 48(%rsi) # encoding: [0x0f,0x11,0x56,0x30] -; X64-SSE-NEXT: movups %xmm1, (%rsi) # encoding: [0x0f,0x11,0x0e] -; X64-SSE-NEXT: movups %xmm0, 16(%rsi) # encoding: [0x0f,0x11,0x46,0x10] +; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07] +; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08] +; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x10] +; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x18] +; X64-SSE-NEXT: movups %xmm3, 48(%rsi) # encoding: [0x0f,0x11,0x5e,0x30] +; X64-SSE-NEXT: movups %xmm2, 32(%rsi) # encoding: [0x0f,0x11,0x56,0x20] +; X64-SSE-NEXT: movups %xmm1, 16(%rsi) # encoding: [0x0f,0x11,0x4e,0x10] +; X64-SSE-NEXT: movups %xmm0, (%rsi) # encoding: [0x0f,0x11,0x06] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX-LABEL: fpext_frommem8: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2141,39 +2141,33 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: psrlq $1, %xmm2 ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ss %rax, %xmm3 +; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: xorps %xmm3, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[2,3] -; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],zero,zero +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: addps %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_4i64_to_4f32_undef: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovaps %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX1-NEXT: vorpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; 
AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 @@ -2570,28 +2564,31 @@ ; ; AVX1-LABEL: uitofp_4i64_to_4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -3658,8 +3655,8 @@ ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -4372,23 +4369,24 @@ ; AVX1-NEXT: vmovapd (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 -; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 +; AVX1-NEXT: vorpd %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm3, %ymm4, %ymm0, %ymm0 ; 
AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 @@ -4824,7 +4822,8 @@ ; AVX1-NEXT: vpsrlq $1, %xmm8, %xmm9 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX1-NEXT: vorpd %ymm5, %ymm7, %ymm5 -; AVX1-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm7 +; AVX1-NEXT: vblendvpd %ymm7, %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 ; AVX1-NEXT: vmovq %xmm3, %rax @@ -4845,7 +4844,8 @@ ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-NEXT: vorpd %ymm4, %ymm5, %ymm4 -; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4 ; AVX1-NEXT: vmovq %xmm2, %rax diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -503,9 +503,12 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-NEXT: retq @@ -566,8 +569,7 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -999,15 +1001,15 @@ ; ; AVX512-LABEL: saddo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 -; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k2 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; 
AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: kshiftlw $12, %k2, %k0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX512-NEXT: kshiftlw $12, %k0, %k0 ; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll --- a/llvm/test/CodeGen/X86/vec_setcc-2.ll +++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll @@ -231,12 +231,14 @@ ; SSE2-LABEL: ugt_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -303,13 +305,15 @@ ; SSE2-LABEL: uge_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: uge_v2i64_splat: @@ -376,11 +380,13 @@ ; SSE2-LABEL: ult_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ult_v2i64_splat: @@ -446,15 +452,17 @@ ; SSE2-LABEL: ule_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ule_v2i64_splat: diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll --- a/llvm/test/CodeGen/X86/vec_setcc.ll +++ b/llvm/test/CodeGen/X86/vec_setcc.ll @@ -310,11 +310,12 @@ ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_setcc_v3i1_v3i16: diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -422,151 +422,151 @@ ; SSE2-LABEL: smulo_v6i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE2-NEXT: movd %r8d, %xmm5 +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE2-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE2-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: movd {{.*#+}} xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 -; SSE2-NEXT: pand %xmm4, %xmm8 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; 
SSE2-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE2-NEXT: psubd %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-NEXT: pand %xmm6, %xmm12 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm7, %xmm13 +; SSE2-NEXT: paddd %xmm12, %xmm13 +; SSE2-NEXT: pmuludq %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,3,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE2-NEXT: pmuludq %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: psubd %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE2-NEXT: movdqa %xmm5, (%rcx) +; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm6 +; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: paddd %xmm6, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: psubd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movd %r8d, %xmm5 +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSSE3-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSSE3-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSSE3-NEXT: movd {{.*#+}} xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm10, %xmm0 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 -; SSSE3-NEXT: pand %xmm4, %xmm8 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9 -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: paddd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSSE3-NEXT: psubd %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm10, %xmm10 +; SSSE3-NEXT: pxor %xmm12, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSSE3-NEXT: pand %xmm6, %xmm12 +; SSSE3-NEXT: pxor %xmm13, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm13 +; SSSE3-NEXT: pand %xmm7, %xmm13 +; SSSE3-NEXT: paddd %xmm12, %xmm13 +; SSSE3-NEXT: pmuludq %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,3,2,3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSSE3-NEXT: pmuludq %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSSE3-NEXT: psubd %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-NEXT: movdqa %xmm4, (%rcx) -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSSE3-NEXT: pand 
%xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm7, %xmm6 -; SSSE3-NEXT: paddd %xmm8, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSSE3-NEXT: movdqa %xmm5, (%rcx) +; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm6 +; SSSE3-NEXT: pand %xmm3, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 +; SSSE3-NEXT: pand %xmm11, %xmm10 +; SSSE3-NEXT: paddd %xmm6, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSSE3-NEXT: pmuludq %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: psubd %xmm6, %xmm3 +; SSSE3-NEXT: psubd %xmm10, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm5, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v6i32: @@ -1173,16 +1173,15 @@ ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm7, %xmm0 ; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) @@ -1215,10 +1214,13 @@ ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpsrad $31, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-NEXT: retq @@ -1318,8 +1320,7 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -1639,10 +1640,9 @@ ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm8 ; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSSE3-NEXT: pshufb %xmm10, %xmm4 +; SSSE3-NEXT: pshufb %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm3, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm2, (%rsi) ; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) @@ -2003,14 +2003,14 @@ ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm8 ; SSE2-NEXT: psrad $31, %xmm8 -; SSE2-NEXT: movdqa %xmm8, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm8, 240(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 240(%rdi) +; SSE2-NEXT: movdqa %xmm4, 224(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] @@ -2018,14 +2018,14 @@ ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: movdqa %xmm5, 208(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 160(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 176(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 176(%rdi) +; SSE2-NEXT: movdqa %xmm4, 160(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] @@ -2033,14 +2033,14 @@ ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: movdqa %xmm6, 144(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 96(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, 112(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -2048,14 +2048,14 @@ ; SSE2-NEXT: psrad $31, %xmm7 ; SSE2-NEXT: movdqa %xmm7, 80(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm1, 48(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 48(%rdi) +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 @@ -2066,175 +2066,174 @@ ; SSSE3-LABEL: smulo_v64i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: movdqa %xmm10, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSSE3-NEXT: pmulhw %xmm9, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, %xmm11 -; SSSE3-NEXT: psrlw $8, %xmm11 -; SSSE3-NEXT: packuswb %xmm8, %xmm11 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm7 -; SSSE3-NEXT: packuswb %xmm10, %xmm7 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtb %xmm7, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm11, %xmm3 +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSSE3-NEXT: pmulhw %xmm11, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: psrlw $8, %xmm12 +; SSSE3-NEXT: packuswb %xmm9, %xmm12 +; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm11, %xmm10 +; SSSE3-NEXT: 
pand %xmm11, %xmm8 +; SSSE3-NEXT: packuswb %xmm10, %xmm8 +; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pcmpgtb %xmm8, %xmm7 +; SSSE3-NEXT: pcmpeqb %xmm12, %xmm7 +; SSSE3-NEXT: pcmpeqd %xmm12, %xmm12 +; SSSE3-NEXT: pxor %xmm12, %xmm7 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: pxor %xmm13, %xmm13 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm10, %xmm2 -; SSSE3-NEXT: psrlw $8, %xmm2 -; SSSE3-NEXT: pmulhw %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 -; SSSE3-NEXT: packuswb %xmm2, %xmm8 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pmulhw %xmm13, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm13 +; SSSE3-NEXT: psrlw $8, %xmm13 +; SSSE3-NEXT: packuswb %xmm9, %xmm13 +; SSSE3-NEXT: pand %xmm11, %xmm10 +; SSSE3-NEXT: pand %xmm11, %xmm6 ; SSSE3-NEXT: packuswb %xmm10, %xmm6 ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtb %xmm6, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm8, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSSE3-NEXT: pcmpeqb %xmm13, %xmm2 +; SSSE3-NEXT: pxor %xmm12, %xmm2 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSSE3-NEXT: pmulhw %xmm9, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, %xmm9 +; SSSE3-NEXT: psrlw $8, %xmm9 +; SSSE3-NEXT: pxor %xmm13, %xmm13 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] ; SSSE3-NEXT: pxor %xmm5, %xmm5 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSSE3-NEXT: movdqa %xmm10, 
%xmm1 +; SSSE3-NEXT: pmulhw %xmm13, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: pmulhw %xmm8, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSSE3-NEXT: psrlw $8, %xmm8 -; SSSE3-NEXT: packuswb %xmm1, %xmm8 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: packuswb %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm11, %xmm10 +; SSSE3-NEXT: pand %xmm11, %xmm5 ; SSSE3-NEXT: packuswb %xmm10, %xmm5 +; SSSE3-NEXT: pxor %xmm9, %xmm9 +; SSSE3-NEXT: pcmpgtb %xmm5, %xmm9 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm9 +; SSSE3-NEXT: pxor %xmm12, %xmm9 ; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtb %xmm5, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm8, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSSE3-NEXT: pxor %xmm13, %xmm13 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm0[8],xmm13[9],xmm0[9],xmm13[10],xmm0[10],xmm13[11],xmm0[11],xmm13[12],xmm0[12],xmm13[13],xmm0[13],xmm13[14],xmm0[14],xmm13[15],xmm0[15] +; SSSE3-NEXT: pmulhw %xmm1, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, %xmm1 +; SSSE3-NEXT: psrlw $8, %xmm1 +; SSSE3-NEXT: pxor %xmm14, %xmm14 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] ; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSSE3-NEXT: pmulhw %xmm8, %xmm10 -; SSSE3-NEXT: pxor %xmm11, %xmm11 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; SSSE3-NEXT: pmulhw %xmm14, %xmm10 ; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: psrlw $8, %xmm0 -; SSSE3-NEXT: pmulhw %xmm11, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm11 -; SSSE3-NEXT: psrlw $8, %xmm11 -; SSSE3-NEXT: packuswb %xmm0, %xmm11 -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pand %xmm9, %xmm8 -; SSSE3-NEXT: packuswb %xmm10, %xmm8 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpgtb %xmm8, %xmm4 -; SSSE3-NEXT: pcmpeqb %xmm11, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm7, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm11, %xmm13 +; SSSE3-NEXT: pand %xmm11, %xmm10 +; SSSE3-NEXT: packuswb %xmm13, %xmm10 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm10, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm12, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: 
movdqa %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm9, %xmm11 +; SSSE3-NEXT: movdqa %xmm2, %xmm12 +; SSSE3-NEXT: movdqa %xmm2, %xmm13 +; SSSE3-NEXT: movdqa %xmm7, %xmm14 +; SSSE3-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSSE3-NEXT: pshufb %xmm15, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: pshufb %xmm15, %xmm9 +; SSSE3-NEXT: pshufb %xmm15, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm3 +; SSSE3-NEXT: pshufb %xmm15, %xmm7 +; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) ; SSSE3-NEXT: movdqa %xmm6, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm2, %xmm6 ; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm8, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm10, (%rsi) +; SSSE3-NEXT: movdqa %xmm7, 192(%rdi) ; SSSE3-NEXT: movdqa %xmm2, 128(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm4, (%rdi) -; SSSE3-NEXT: movdqa %xmm8, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm9, 64(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 208(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSSE3-NEXT: movaps %xmm2, (%rdi) +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: movdqa %xmm6, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 240(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; 
SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 224(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm14 +; SSSE3-NEXT: psrad $31, %xmm14 +; SSSE3-NEXT: movdqa %xmm14, 208(%rdi) +; SSSE3-NEXT: movdqa %xmm13, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm13 +; SSSE3-NEXT: psrad $31, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, 176(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 160(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm12 +; SSSE3-NEXT: psrad $31, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, 144(%rdi) +; SSSE3-NEXT: movdqa %xmm11, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm11 +; SSSE3-NEXT: psrad $31, %xmm11 +; SSSE3-NEXT: movdqa %xmm11, 112(%rdi) ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 112(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 32(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 @@ -2347,61 +2346,61 @@ ; SSE41-NEXT: movdqa %xmm4, 64(%rdi) ; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm4, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm4, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE41-NEXT: movdqa 
%xmm4, 240(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm4, 240(%rdi) +; SSE41-NEXT: movdqa %xmm4, 224(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 ; SSE41-NEXT: movdqa %xmm3, 208(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 160(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; SSE41-NEXT: movdqa %xmm3, 176(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 176(%rdi) +; SSE41-NEXT: movdqa %xmm3, 160(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, 144(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: movdqa %xmm2, 96(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; SSE41-NEXT: movdqa %xmm2, 112(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: movdqa %xmm2, 112(%rdi) +; SSE41-NEXT: movdqa %xmm2, 96(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa %xmm1, 80(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm1, 32(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE41-NEXT: movdqa %xmm1, 48(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm1, 48(%rdi) +; SSE41-NEXT: movdqa %xmm1, 32(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 @@ -2491,39 +2490,39 @@ ; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 112(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) @@ -3258,9 +3257,9 @@ ; AVX-LABEL: smulo_v4i1: ; AVX: # %bb.0: ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpslld $31, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX-NEXT: movb %al, (%rdi) ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -508,9 +508,12 @@ ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-NEXT: retq @@ -571,8 +574,7 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: 
punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -1008,14 +1010,14 @@ ; ; AVX512-LABEL: ssubo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 -; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1} -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX512-NEXT: kshiftlw $12, %k0, %k0 ; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -562,16 +562,15 @@ ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) @@ -592,10 +591,13 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpmaxud %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-NEXT: retq @@ -651,8 +653,7 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] @@ -1098,15 +1099,15 @@ ; ; AVX512-LABEL: uaddo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: vpslld $31, %xmm0, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 +; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 -; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k2 -; AVX512-NEXT: kandw %k1, %k0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512-NEXT: kshiftlw $12, %k2, %k0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX512-NEXT: kshiftlw $12, %k0, %k0 ; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -138,10 +138,12 @@ ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: movd %xmm2, 8(%rdi) -; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movq %xmm2, (%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -158,10 +160,12 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: movd %xmm2, 8(%rdi) -; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm0, 8(%rdi) +; SSSE3-NEXT: movq %xmm2, (%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -362,101 +366,101 @@ ; SSE2-LABEL: umulo_v6i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: 
movd %esi, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movd %r8d, %xmm4 +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE2-NEXT: pmuludq %xmm1, %xmm6 +; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = mem[0,0,0,0] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: movq %xmm0, 16(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: pmuludq %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: movq %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movq %xmm7, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movd %r8d, %xmm4 +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSSE3-NEXT: pmuludq %xmm1, %xmm6 +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSSE3-NEXT: pmuludq %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = mem[0,0,0,0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] -; SSSE3-NEXT: pmuludq %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSSE3-NEXT: movq %xmm0, 16(%rcx) -; SSSE3-NEXT: movdqa %xmm3, (%rcx) +; SSSE3-NEXT: pmuludq %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 
= xmm6[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm3, %xmm7 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSSE3-NEXT: movq %xmm1, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm4, (%rcx) ; SSSE3-NEXT: movq %xmm7, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v6i32: @@ -964,26 +968,25 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5],xmm11[6,7] ; AVX1-NEXT: vpcmpeqd %xmm7, %xmm11, %xmm7 ; AVX1-NEXT: vpackssdw %xmm9, %xmm7, %xmm7 -; AVX1-NEXT: vpacksswb %xmm6, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vpacksswb %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpmulld %xmm8, %xmm10, %xmm8 +; AVX1-NEXT: vpmulld %xmm8, %xmm10, %xmm7 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmovsxbd %xmm7, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpacksswb %xmm6, %xmm6, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) ; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm8, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm7, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm2, (%rdi) ; AVX1-NEXT: retq ; @@ -1008,12 +1011,15 @@ ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm5 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm0 -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] -; AVX2-NEXT: vpxor %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 +; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1 +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-NEXT: retq @@ -1104,8 +1110,7 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -1396,10 +1401,9 @@ ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm8 ; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSSE3-NEXT: pshufb %xmm10, %xmm0 +; SSSE3-NEXT: pshufb %xmm10, %xmm1 ; SSSE3-NEXT: movdqa %xmm4, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm2, (%rsi) ; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) @@ -1721,14 +1725,14 @@ ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, 240(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 240(%rdi) +; SSE2-NEXT: movdqa %xmm0, 224(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] @@ -1736,14 +1740,14 @@ ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: movdqa %xmm5, 208(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 160(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 176(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 176(%rdi) +; SSE2-NEXT: movdqa %xmm0, 160(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] @@ -1751,14 +1755,14 @@ ; SSE2-NEXT: psrad $31, %xmm8 ; SSE2-NEXT: movdqa %xmm8, 144(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 96(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, 112(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 112(%rdi) +; SSE2-NEXT: movdqa %xmm0, 96(%rdi) ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] @@ -1766,14 +1770,14 @@ ; SSE2-NEXT: psrad $31, %xmm7 ; SSE2-NEXT: movdqa %xmm7, 80(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm1, 48(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm0, 32(%rdi) ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm6 @@ -1783,6 +1787,8 @@ ; ; SSSE3-LABEL: umulo_v64i8: ; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa %xmm5, %xmm13 +; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: pxor %xmm10, %xmm10 ; SSSE3-NEXT: movdqa %xmm4, %xmm8 @@ -1796,22 +1802,23 @@ ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; SSSE3-NEXT: pmullw %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 -; SSSE3-NEXT: packuswb %xmm9, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: packuswb %xmm9, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSSE3-NEXT: movdqa %xmm13, %xmm9 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] ; SSSE3-NEXT: movdqa %xmm1, %xmm12 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] ; SSSE3-NEXT: pmullw %xmm9, %xmm12 ; SSSE3-NEXT: movdqa %xmm12, %xmm9 ; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSSE3-NEXT: pmullw %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm9, %xmm5 +; SSSE3-NEXT: pmullw %xmm13, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: packuswb %xmm9, %xmm4 ; SSSE3-NEXT: movdqa %xmm6, %xmm9 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] ; SSSE3-NEXT: movdqa %xmm2, %xmm13 @@ -1827,116 +1834,114 @@ ; SSSE3-NEXT: packuswb %xmm14, %xmm9 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; SSSE3-NEXT: movdqa %xmm3, %xmm14 +; SSSE3-NEXT: movdqa %xmm5, %xmm14 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] ; SSSE3-NEXT: pmullw %xmm6, %xmm14 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSSE3-NEXT: pmullw %xmm7, %xmm3 ; SSSE3-NEXT: movdqa %xmm14, %xmm6 ; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSSE3-NEXT: pmullw %xmm7, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm8 ; SSSE3-NEXT: packuswb %xmm6, %xmm8 ; SSSE3-NEXT: psrlw $8, %xmm14 -; SSSE3-NEXT: psrlw $8, %xmm3 -; SSSE3-NEXT: packuswb %xmm14, %xmm3 +; SSSE3-NEXT: psrlw $8, %xmm5 +; SSSE3-NEXT: packuswb %xmm14, %xmm5 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm5 +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm5 ; SSSE3-NEXT: psrlw $8, %xmm13 ; SSSE3-NEXT: psrlw $8, %xmm2 ; SSSE3-NEXT: packuswb %xmm13, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm2 +; SSSE3-NEXT: pxor %xmm6, %xmm2 ; SSSE3-NEXT: psrlw $8, %xmm12 ; SSSE3-NEXT: psrlw $8, %xmm1 ; SSSE3-NEXT: packuswb %xmm12, %xmm1 +; SSSE3-NEXT: pcmpeqb %xmm10, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm1 ; SSSE3-NEXT: psrlw $8, %xmm11 ; SSSE3-NEXT: psrlw $8, %xmm0 ; SSSE3-NEXT: packuswb %xmm11, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm10, %xmm3 -; SSSE3-NEXT: pcmpeqb %xmm10, %xmm2 -; SSSE3-NEXT: pcmpeqb %xmm10, %xmm1 ; SSSE3-NEXT: pcmpeqb %xmm10, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm1 ; SSSE3-NEXT: pxor %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: movdqa %xmm2, %xmm12 +; SSSE3-NEXT: movdqa %xmm2, %xmm13 +; SSSE3-NEXT: movdqa %xmm5, %xmm14 +; SSSE3-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSSE3-NEXT: pshufb %xmm15, %xmm0 +; SSSE3-NEXT: pshufb %xmm15, %xmm1 +; SSSE3-NEXT: pshufb %xmm15, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pshufb %xmm15, %xmm5 ; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) -; SSSE3-NEXT: movdqa %xmm1, %xmm7 ; SSSE3-NEXT: movdqa %xmm9, 32(%rsi) -; SSSE3-NEXT: movdqa %xmm2, %xmm8 -; SSSE3-NEXT: movdqa %xmm5, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm4, 16(%rsi) +; SSSE3-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSSE3-NEXT: movaps %xmm4, (%rsi) +; SSSE3-NEXT: movdqa %xmm5, 192(%rdi) ; SSSE3-NEXT: movdqa %xmm2, 128(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 240(%rdi) ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 208(%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, 240(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 176(%rdi) -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm0, 224(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm14 +; SSSE3-NEXT: psrad $31, %xmm14 +; SSSE3-NEXT: movdqa %xmm14, 208(%rdi) +; SSSE3-NEXT: movdqa %xmm13, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm13 +; SSSE3-NEXT: psrad $31, %xmm13 +; SSSE3-NEXT: movdqa %xmm13, 176(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 112(%rdi) -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
SSSE3-NEXT: movdqa %xmm0, 160(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm12 +; SSSE3-NEXT: psrad $31, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, 144(%rdi) +; SSSE3-NEXT: movdqa %xmm11, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm11 +; SSSE3-NEXT: psrad $31, %xmm11 +; SSSE3-NEXT: movdqa %xmm11, 112(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: movdqa %xmm0, 96(%rdi) +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm10 +; SSSE3-NEXT: psrad $31, %xmm10 +; SSSE3-NEXT: movdqa %xmm10, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm7 ; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: movdqa %xmm7, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 32(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm7, 48(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 32(%rdi) ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm6 @@ -2025,61 +2030,61 @@ ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) ; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 240(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 240(%rdi) +; SSE41-NEXT: movdqa %xmm0, 224(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 208(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 160(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 
176(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 176(%rdi) +; SSE41-NEXT: movdqa %xmm0, 160(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 144(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 96(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 112(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 112(%rdi) +; SSE41-NEXT: movdqa %xmm0, 96(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 80(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 32(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 48(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 48(%rdi) +; SSE41-NEXT: movdqa %xmm0, 32(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 @@ -2165,39 +2170,39 @@ ; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; 
AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -605,16 +605,15 @@ ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 ; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi) @@ -635,10 +634,13 @@ ; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpminud %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) ; AVX2-NEXT: vmovdqa %ymm2, (%rdi) ; AVX2-NEXT: retq @@ -695,8 +697,7 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 
= xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] @@ -1145,14 +1146,14 @@ ; ; AVX512-LABEL: usubo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 -; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 -; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k0 +; AVX512-NEXT: vpslld $31, %xmm1, %xmm2 +; AVX512-NEXT: vptestmd %xmm2, %xmm2, %k1 ; AVX512-NEXT: kxorw %k1, %k0, %k0 -; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 {%k1} -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX512-NEXT: kshiftlw $12, %k0, %k0 ; AVX512-NEXT: kshiftrw $12, %k0, %k0 ; AVX512-NEXT: kmovd %k0, %eax diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -987,7 +987,9 @@ ; ; GFNIAVX1-LABEL: test_bitreverse_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v32i8: @@ -1176,7 +1178,9 @@ ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v16i16: @@ -1376,7 +1380,9 @@ ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v8i32: @@ -1580,7 +1586,9 @@ ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v4i64: @@ -1855,7 +1863,8 @@ ; ; GFNIAVX1-LABEL: test_bitreverse_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: 
vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq @@ -2176,7 +2185,8 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -2534,7 +2544,8 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -2900,7 +2911,8 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -516,9 +516,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vsubps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vsubps %zmm2, %zmm1, %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> zeroinitializer, <16 x float> %y %r = fsub <16 x float> %x, %s @@ -2672,16 +2672,16 @@ ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: mul_v4i32: @@ -2731,26 +2731,26 @@ ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2812,32 +2812,32 @@ ; SSE2-LABEL: mul_v8i32_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psubd %xmm5, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; 
SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pmuludq %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] @@ -5666,9 +5666,8 @@ ; ; AVX2-LABEL: select_sdiv_neutral_constant_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 -; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpsllq $63, %ymm5, %ymm5 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] ; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm3, %ymm5 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -45,36 +45,16 @@ ; SSE-NEXT: negq %rax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4f64_sext: -; AVX1: # %bb.0: -; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vtestpd %ymm1, %ymm0 -; AVX1-NEXT: sbbq %rax, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4f64_sext: -; AVX2: # %bb.0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestpd %ymm1, %ymm0 -; AVX2-NEXT: sbbq %rax, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4f64_sext: -; AVX512: # %bb.0: -; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestpd %ymm1, %ymm0 -; AVX512-NEXT: sbbq %rax, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test_v4f64_sext: +; AVX: # %bb.0: +; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> %1 = shufflevector <4 x i64> %s, <4 x i64> undef, <4 x i32> @@ -173,36 +153,18 @@ ; SSE-NEXT: negl %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v8f32_sext: -; AVX1: # %bb.0: -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestps %ymm1, %ymm0 -; AVX1-NEXT: sbbl %eax, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8f32_sext: -; AVX2: # %bb.0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestps %ymm1, %ymm0 -; AVX2-NEXT: sbbl %eax, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8f32_sext: -; AVX512: # %bb.0: -; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, 
%ymm1, %ymm1 -; AVX512-NEXT: vtestps %ymm1, %ymm0 -; AVX512-NEXT: sbbl %eax, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test_v8f32_sext: +; AVX: # %bb.0: +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> %1 = shufflevector <8 x i32> %s, <8 x i32> undef, <8 x i32> @@ -357,30 +319,31 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vtestpd %xmm1, %xmm0 -; AVX1-NEXT: sbbq %rax, %rax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestpd %ymm1, %ymm0 -; AVX2-NEXT: sbbq %rax, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestpd %ymm1, %ymm0 -; AVX512-NEXT: sbbq %rax, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -528,30 +491,37 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vtestps %xmm1, %xmm0 -; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestps %ymm1, %ymm0 -; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestps %ymm1, %ymm0 -; AVX512-NEXT: sbbl %eax, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, 
%xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -686,11 +656,13 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; AVX1-NEXT: sete %al -; AVX1-NEXT: negl %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -698,11 +670,15 @@ ; AVX2-LABEL: test_v16i16_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: cmpl $-1, %ecx -; AVX2-NEXT: sete %al -; AVX2-NEXT: negl %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -710,11 +686,15 @@ ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: cmpl $-1, %ecx -; AVX512-NEXT: sete %al -; AVX512-NEXT: negl %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -852,30 +832,52 @@ ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; AVX1-NEXT: sete %al -; AVX1-NEXT: negb %al +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v32i8_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: cmpl $-1, %eax -; AVX2-NEXT: sete %al -; AVX2-NEXT: negb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand 
%ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %eax -; AVX512-NEXT: cmpl $-1, %eax -; AVX512-NEXT: sete %al -; AVX512-NEXT: negb %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1 @@ -1543,27 +1545,32 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: cmpl $3, %eax -; SSE2-NEXT: sete %al +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; ; SSE42-LABEL: select_v2i8: ; SSE42: # %bb.0: ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pxor %xmm0, %xmm1 -; SSE42-NEXT: ptest %xmm1, %xmm1 -; SSE42-NEXT: sete %al +; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 +; SSE42-NEXT: movmskpd %xmm1, %ecx +; SSE42-NEXT: movl %ecx, %eax +; SSE42-NEXT: shrb %al +; SSE42-NEXT: andb %cl, %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vptest %xmm0, %xmm0 -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -46,10 +46,11 @@ ; AVX-LABEL: test_v4f64_sext: ; AVX: # %bb.0: ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: vtestpd %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: negq %rax +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vorpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -151,10 +152,13 @@ ; AVX-LABEL: test_v8f32_sext: ; AVX: # %bb.0: ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: vtestps %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: negl %eax +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -304,30 +308,31 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestpd %xmm0, %xmm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: negq %rax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vtestpd %ymm0, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: negq %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vtestpd %ymm0, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: negq %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -472,30 +477,37 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestps %xmm0, %xmm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: negl %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vtestps %ymm0, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: negl %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vtestps %ymm0, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: negl %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -623,10 +635,13 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: negl %ecx -; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -634,10 +649,15 @@ ; AVX2-LABEL: test_v16i16_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -645,10 +665,15 @@ ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: negl %ecx -; AVX512-NEXT: sbbl %eax, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -785,10 +810,15 @@ ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: negl %ecx -; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -796,10 +826,17 @@ ; AVX2-LABEL: test_v32i8_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; 
AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -807,10 +844,17 @@ ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: negl %ecx -; AVX512-NEXT: sbbl %eax, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1426,9 +1470,10 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; ; SSE42-LABEL: select_v2i8: @@ -1436,9 +1481,10 @@ ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: movmskpd %xmm1, %eax -; SSE42-NEXT: testl %eax, %eax -; SSE42-NEXT: setne %al +; SSE42-NEXT: movmskpd %xmm1, %ecx +; SSE42-NEXT: movl %ecx, %eax +; SSE42-NEXT: shrb %al +; SSE42-NEXT: orb %cl, %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: @@ -1446,8 +1492,10 @@ ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -7539,8 +7539,7 @@ ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vextractps $2, %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %eax +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -7656,28 +7655,31 @@ ; ; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: 
vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-ext-logic.ll b/llvm/test/CodeGen/X86/vector-ext-logic.ll --- a/llvm/test/CodeGen/X86/vector-ext-logic.ll +++ b/llvm/test/CodeGen/X86/vector-ext-logic.ll @@ -250,25 +250,19 @@ define <8 x i32> @bool_zext_and(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_zext_and: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: bool_zext_and: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpand 
%ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %xz = zext <8 x i1> %x to <8 x i32> %yz = zext <8 x i1> %y to <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1269,31 +1269,31 @@ ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm5 ; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm9 ; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,3],xmm8[1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm9, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm11 ; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,3],xmm10[1,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm7 -; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] -; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm5 ; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,3],xmm5[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3],xmm7[1,3] +; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm6, %xmm4 +; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm6 +; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] ; AVX1-NEXT: vblendvps %xmm3, %xmm10, %xmm5, %xmm3 -; AVX1-NEXT: vmovups %xmm4, 4096(%rdi,%rax,4) -; AVX1-NEXT: vmovups %xmm3, 4112(%rdi,%rax,4) +; AVX1-NEXT: vmovups %xmm4, 4112(%rdi,%rax,4) +; AVX1-NEXT: vmovups %xmm3, 4096(%rdi,%rax,4) ; AVX1-NEXT: addq $8, %rax ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %exit @@ -1340,9 +1340,11 @@ ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB8_1: # %loop ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 +; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-NEXT: vpblendmd %zmm0, %zmm1, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 ; AVX512F-NEXT: vprolvd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) @@ -1357,15 +1359,18 @@ ; AVX512VL-NEXT: vpbroadcastd %edx, %ymm0 ; AVX512VL-NEXT: vpbroadcastd %ecx, %ymm1 ; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB8_1: # %loop 
; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 -; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} -; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 -; AVX512VL-NEXT: vprolvd %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) +; AVX512VL-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512VL-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512VL-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm3 {%k1} +; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 +; AVX512VL-NEXT: vprolvd %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) ; AVX512VL-NEXT: addq $8, %rax ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %exit @@ -1504,11 +1509,11 @@ ; XOPAVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; XOPAVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 ; XOPAVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 -; XOPAVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; XOPAVX2-NEXT: vprotd %xmm4, 4112(%rdi,%rax,4), %xmm4 -; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm3 -; XOPAVX2-NEXT: vmovdqu %xmm3, 4096(%rdi,%rax,4) -; XOPAVX2-NEXT: vmovdqu %xmm4, 4112(%rdi,%rax,4) +; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm4 +; XOPAVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; XOPAVX2-NEXT: vprotd %xmm3, 4112(%rdi,%rax,4), %xmm3 +; XOPAVX2-NEXT: vmovdqu %xmm3, 4112(%rdi,%rax,4) +; XOPAVX2-NEXT: vmovdqu %xmm4, 4096(%rdi,%rax,4) ; XOPAVX2-NEXT: addq $8, %rax ; XOPAVX2-NEXT: jne .LBB8_1 ; XOPAVX2-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1317,9 +1317,9 @@ ; SSE41-LABEL: constant_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -1328,13 +1328,13 @@ ; ; AVX1-LABEL: constant_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -1088,21 +1088,21 @@ ; AVX1-LABEL: constant_funnnel_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -293,9 +293,9 @@ ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -304,13 +304,13 @@ ; ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -824,8 +824,10 @@ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] +; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -843,8 +845,10 @@ ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] +; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] ; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1066,9 +1066,9 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -1381,9 +1381,9 @@ ; SSE41-LABEL: constant_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -1392,13 +1392,13 @@ ; ; AVX1-LABEL: constant_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -884,9 +884,9 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -1139,21 +1139,21 @@ ; AVX1-LABEL: constant_funnnel_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -307,9 +307,9 @@ ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -318,13 +318,13 @@ ; ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3156,6 +3156,27 @@ ; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; F16C-NEXT: addq $40, %rsp ; F16C-NEXT: retq +; +; AVX512-LABEL: cvt_2f64_to_2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> ret <2 x i16> %2 @@ -3284,8 +3305,9 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -3416,8 +3438,9 @@ ; AVX512-NEXT: vmovdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -4119,9 +4142,10 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] -; AVX512-NEXT: vmovaps %xmm0, (%rbx) +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) ; AVX512-NEXT: addq $64, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq @@ -4942,7 +4966,7 @@ ; F16C-NEXT: movzwl %ax, %eax ; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; F16C-NEXT: vcvttps2dq %xmm0, %xmm0 ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; F16C-NEXT: retq @@ -4958,7 +4982,7 @@ ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -540,12 +540,13 @@ ; AVX1-NEXT: vmovups 16(%rdi), %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vmovups %xmm1, 48(%rsi) -; AVX1-NEXT: vmovups %xmm3, 32(%rsi) -; AVX1-NEXT: vmovups %xmm0, 16(%rsi) -; AVX1-NEXT: vmovups %xmm2, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovups %ymm1, 32(%rsi) +; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat2_i32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -137,30 +137,30 @@ ; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pslld $16, %xmm5 ; SSE-NEXT: psrad $16, %xmm5 ; SSE-NEXT: packssdw %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm3, 
%xmm4 ; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pslld $16, %xmm6 ; SSE-NEXT: psrad $16, %xmm6 ; SSE-NEXT: packssdw %xmm4, %xmm6 -; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm3, %xmm1 ; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm6, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, (%rsi) ; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf16: @@ -188,60 +188,22 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: load_i16_stride2_vf16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: load_i16_stride2_vf16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: load_i16_stride2_vf16: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: load_i16_stride2_vf16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: load_i16_stride2_vf16: ; AVX512: # %bb.0: @@ -262,32 +224,32 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i16_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: psrad $16, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: psrad $16, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pslld $16, %xmm5 ; SSE-NEXT: psrad $16, %xmm5 -; SSE-NEXT: packssdw %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: packssdw %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pslld $16, %xmm10 ; SSE-NEXT: psrad $16, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pslld $16, %xmm8 -; SSE-NEXT: psrad $16, %xmm8 -; SSE-NEXT: packssdw %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pslld $16, %xmm7 +; SSE-NEXT: psrad $16, %xmm7 +; SSE-NEXT: packssdw %xmm10, %xmm7 ; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrad $16, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pslld $16, %xmm10 ; SSE-NEXT: psrad $16, %xmm10 ; SSE-NEXT: packssdw 
%xmm11, %xmm10 @@ -299,25 +261,25 @@ ; SSE-NEXT: psrad $16, %xmm12 ; SSE-NEXT: packssdw %xmm11, %xmm12 ; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm9, %xmm1 +; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: packssdw %xmm9, %xmm3 -; SSE-NEXT: psrad $16, %xmm7 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm7, %xmm2 +; SSE-NEXT: packssdw %xmm8, %xmm3 ; SSE-NEXT: psrad $16, %xmm6 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm6, %xmm1 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: packssdw %xmm6, %xmm2 ; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, 32(%rsi) -; SSE-NEXT: movdqa %xmm10, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm12, 48(%rsi) +; SSE-NEXT: movdqa %xmm10, 32(%rsi) +; SSE-NEXT: movdqa %xmm7, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) ; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf32: @@ -365,115 +327,50 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: load_i16_stride2_vf32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: load_i16_stride2_vf32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; 
AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: load_i16_stride2_vf32: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: load_i16_stride2_vf32: +; AVX2-ONLY: # 
%bb.0: +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i16_stride2_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm3 -; AVX512F-NEXT: vpmovdw %zmm1, 32(%rsi) +; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm3 ; AVX512F-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512F-NEXT: vpmovdw %zmm3, 32(%rdx) -; AVX512F-NEXT: vpmovdw %zmm2, (%rdx) +; AVX512F-NEXT: vpmovdw %zmm1, 32(%rsi) +; AVX512F-NEXT: vpmovdw %zmm3, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm2, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -501,26 +398,26 @@ ; SSE-LABEL: load_i16_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 64(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm11 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa 128(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm12 @@ -547,41 +444,42 @@ ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm13 -; SSE-NEXT: psrad $16, %xmm13 -; SSE-NEXT: packssdw %xmm0, %xmm13 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm14 +; SSE-NEXT: psrad $16, %xmm14 +; SSE-NEXT: packssdw %xmm0, %xmm14 +; SSE-NEXT: movdqa 208(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: pslld $16, %xmm15 ; SSE-NEXT: psrad $16, %xmm15 ; SSE-NEXT: packssdw %xmm0, %xmm15 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: packssdw %xmm1, %xmm4 -; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: movdqa 240(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm6 ; SSE-NEXT: pslld $16, %xmm6 ; SSE-NEXT: psrad $16, %xmm6 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: packssdw %xmm6, %xmm1 -; SSE-NEXT: psrad $16, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm6 +; SSE-NEXT: packssdw %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -590,9 +488,10 @@ ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm14 -; SSE-NEXT: packssdw %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm6 +; SSE-NEXT: packssdw %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrad $16, %xmm3 ; SSE-NEXT: psrad $16, %xmm5 ; SSE-NEXT: packssdw %xmm3, %xmm5 @@ -606,313 +505,195 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm3 ; 
SSE-NEXT: packssdw %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm13 ; SSE-NEXT: psrad $16, %xmm7 -; SSE-NEXT: packssdw %xmm0, %xmm7 +; SSE-NEXT: packssdw %xmm13, %xmm7 ; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: packssdw %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm1, 96(%rsi) -; SSE-NEXT: movdqa %xmm4, 32(%rsi) -; SSE-NEXT: movdqa %xmm15, 112(%rsi) -; SSE-NEXT: movdqa %xmm13, 48(%rsi) -; SSE-NEXT: movdqa %xmm11, 64(%rsi) -; SSE-NEXT: movdqa %xmm9, (%rsi) -; SSE-NEXT: movdqa %xmm10, 80(%rsi) -; SSE-NEXT: movdqa %xmm12, 16(%rsi) -; SSE-NEXT: movdqa %xmm2, 96(%rdx) -; SSE-NEXT: movdqa %xmm7, 112(%rdx) -; SSE-NEXT: movdqa %xmm3, 64(%rdx) -; SSE-NEXT: movdqa %xmm6, 80(%rdx) -; SSE-NEXT: movdqa %xmm5, 32(%rdx) -; SSE-NEXT: movdqa %xmm14, 48(%rdx) +; SSE-NEXT: movdqa %xmm1, 112(%rsi) +; SSE-NEXT: movdqa %xmm4, 48(%rsi) +; SSE-NEXT: movdqa %xmm15, 96(%rsi) +; SSE-NEXT: movdqa %xmm14, 32(%rsi) +; SSE-NEXT: movdqa %xmm11, 80(%rsi) +; SSE-NEXT: movdqa %xmm9, 16(%rsi) +; SSE-NEXT: movdqa %xmm10, 64(%rsi) +; SSE-NEXT: movdqa %xmm12, (%rsi) +; SSE-NEXT: movdqa %xmm2, 112(%rdx) +; SSE-NEXT: movdqa %xmm7, 96(%rdx) +; SSE-NEXT: movdqa %xmm3, 80(%rdx) +; SSE-NEXT: movdqa %xmm6, 64(%rdx) +; SSE-NEXT: movdqa %xmm5, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $24, %rsp -; AVX1-ONLY-NEXT: vpxor %xmm11, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2],xmm11[3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm11[1],xmm9[2],xmm11[3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] +; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4],xmm10[5],xmm13[6],xmm10[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2],xmm11[3],xmm10[4],xmm11[5],xmm10[6],xmm11[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm11[1],xmm7[2],xmm11[3],xmm7[4],xmm11[5],xmm7[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, 
%xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm11[1],xmm5[2],xmm11[3],xmm5[4],xmm11[5],xmm5[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm11[1],xmm4[2],xmm11[3],xmm4[4],xmm11[5],xmm4[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm10[1],xmm0[2],xmm10[3],xmm0[4],xmm10[5],xmm0[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1],xmm0[2],xmm10[3],xmm0[4],xmm10[5],xmm0[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm11[1],xmm8[2],xmm11[3],xmm8[4],xmm11[5],xmm8[6],xmm11[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm11[1],xmm6[2],xmm11[3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm10[1],xmm3[2],xmm10[3],xmm3[4],xmm10[5],xmm3[6],xmm10[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4],xmm10[5],xmm4[6],xmm10[7] ; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2],xmm10[3],xmm9[4],xmm10[5],xmm9[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4],xmm11[5],xmm15[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm11[1],xmm6[2],xmm11[3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm10[1],xmm6[2],xmm10[3],xmm6[4],xmm10[5],xmm6[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0],xmm10[1],xmm2[2],xmm10[3],xmm2[4],xmm10[5],xmm2[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm11[1],xmm3[2],xmm11[3],xmm3[4],xmm11[5],xmm3[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm11[1],xmm0[2],xmm11[3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm8 -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm11[1],xmm8[2],xmm11[3],xmm8[4],xmm11[5],xmm8[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1],xmm2[2],xmm11[3],xmm2[4],xmm11[5],xmm2[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4],xmm10[5],xmm15[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm10[1],xmm8[2],xmm10[3],xmm8[4],xmm10[5],xmm8[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm10[1],xmm3[2],xmm10[3],xmm3[4],xmm10[5],xmm3[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm10[1],xmm1[2],xmm10[3],xmm1[4],xmm10[5],xmm1[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm10[1],xmm5[2],xmm10[3],xmm5[4],xmm10[5],xmm5[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4],xmm10[5],xmm4[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vpackusdw %xmm3, 
%xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 112(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%rdx) ; AVX1-ONLY-NEXT: addq $24, %rsp ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: load_i16_stride2_vf64: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm2[0,2],ymm7[4,6],ymm2[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = 
ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm10[0,2],ymm7[0,2],ymm10[4,6],ymm7[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: load_i16_stride2_vf64: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, 
%ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm10 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: load_i16_stride2_vf64: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-FAST-PERLANE-NEXT: 
vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: load_i16_stride2_vf64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm5, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm4, %ymm11 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm8, %ymm12 +; AVX2-ONLY-NEXT: vpshufb %ymm10, 
%ymm7, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm8, %ymm6 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 96(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 64(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i16_stride2_vf64: ; AVX512F: # %bb.0: @@ -920,19 +701,18 @@ ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm4 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm5 ; AVX512F-NEXT: vpsrld $16, %zmm3, %zmm6 ; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm7 -; AVX512F-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa %ymm4, 32(%rsi) ; AVX512F-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512F-NEXT: vpmovdw %zmm0, (%rsi) ; AVX512F-NEXT: vpmovdw %zmm3, 96(%rsi) +; AVX512F-NEXT: vpmovdw %zmm1, 32(%rsi) ; AVX512F-NEXT: vpmovdw %zmm7, 64(%rdx) ; AVX512F-NEXT: vpmovdw %zmm6, 96(%rdx) ; AVX512F-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512F-NEXT: vpmovdw %zmm1, 32(%rdx) +; AVX512F-NEXT: vpmovdw %zmm4, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -964,7 +744,9 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX2: {{.*}} -; AVX2-ONLY: {{.*}} +; AVX2-FAST: {{.*}} +; AVX2-FAST-PERLANE: {{.*}} +; AVX2-SLOW: {{.*}} ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -579,25 +579,23 @@ ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 @@ -616,36 +614,34 @@ ; ; AVX512F-LABEL: load_i16_stride3_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512F-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] @@ -686,19 +682,19 @@ ; SSE-LABEL: load_i16_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $40, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 
80(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm12 +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm12, %xmm2 @@ -744,7 +740,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 160(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] @@ -755,7 +751,7 @@ ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7] @@ -902,21 +898,21 @@ ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movdqa %xmm5, 32(%rdx) -; SSE-NEXT: movdqa %xmm13, 48(%rdx) -; SSE-NEXT: movdqa %xmm15, (%rdx) -; SSE-NEXT: movdqa %xmm10, 16(%rdx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movdqa %xmm5, 48(%rdx) +; SSE-NEXT: movdqa %xmm13, 32(%rdx) +; SSE-NEXT: movdqa %xmm15, 16(%rdx) +; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; @@ -937,7 +933,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] @@ -965,16 +961,14 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] ; 
AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm8[2],xmm5[3,4],xmm8[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm11 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] @@ -986,10 +980,9 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 @@ -999,7 +992,7 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] @@ -1029,23 +1022,22 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14],ymm5[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; 
AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] @@ -1056,42 +1048,39 @@ ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm11 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] @@ -1100,11 +1089,11 @@ ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1118,69 +1107,67 @@ ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 -; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512F-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm10 = ymm4[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7],ymm4[8],ymm10[9],ymm4[10,11],ymm10[12],ymm4[13,14],ymm10[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm10 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512F-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm10[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm12 -; AVX512F-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm12, %ymm13 +; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13 +; AVX512F-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] +; AVX512F-NEXT: vpshufb %ymm11, %ymm13, %ymm11 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm11 -; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1,2],ymm5[3],ymm11[4,5],ymm5[6],ymm11[7],ymm5[8],ymm11[9,10],ymm5[11],ymm11[12,13],ymm5[14],ymm11[15] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12 +; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512F-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7],ymm8[8],ymm0[9,10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15] -; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] 
+; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rdx) @@ -1224,17 +1211,17 @@ ; SSE-LABEL: load_i16_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 -; SSE-NEXT: movdqa 192(%rdi), %xmm14 -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movdqa 256(%rdi), %xmm7 -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa 224(%rdi), %xmm6 +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 208(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 48(%rdi), %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm11, %xmm2 @@ -1280,7 +1267,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: movdqa 256(%rdi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] @@ -1293,7 +1280,7 @@ ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] @@ -1305,16 +1292,16 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] @@ -1326,16 +1313,16 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: 
shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 304(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] @@ -1347,16 +1334,16 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] @@ -1368,14 +1355,14 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 352(%rdi), %xmm3 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] @@ -1665,462 +1652,451 @@ ; SSE-NEXT: pandn %xmm12, %xmm13 ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa 
%xmm6, 32(%rdx) -; SSE-NEXT: movdqa %xmm10, 112(%rdx) -; SSE-NEXT: movdqa %xmm15, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm3, 112(%rdx) +; SSE-NEXT: movdqa %xmm6, 48(%rdx) +; SSE-NEXT: movdqa %xmm10, 96(%rdx) +; SSE-NEXT: movdqa %xmm15, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movdqa %xmm13, 96(%rcx) -; SSE-NEXT: movdqa %xmm9, 112(%rcx) -; SSE-NEXT: movdqa %xmm11, 64(%rcx) -; SSE-NEXT: movdqa %xmm14, 80(%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm5, 48(%rcx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm13, 112(%rcx) +; SSE-NEXT: movdqa %xmm9, 96(%rcx) +; SSE-NEXT: movdqa %xmm11, 80(%rcx) +; SSE-NEXT: movdqa %xmm14, 64(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm5, 32(%rcx) +; SSE-NEXT: movdqa %xmm8, 16(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) ; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: 
vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm15[2],xmm2[3,4],xmm15[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[0,1],mem[2],xmm1[3,4],mem[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm5[0,1],mem[2],xmm5[3,4],mem[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm8[2],xmm12[3,4],xmm8[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm15[0,1],mem[2],xmm15[3,4],mem[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX1-ONLY-NEXT: vpshufb 
%xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm11[0,1],mem[2],xmm11[3,4],mem[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm15[2],xmm8[3,4],xmm15[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = 
xmm0[0,1,2,3,4],xmm12[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm11[1],mem[2,3],xmm11[4],mem[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3],xmm4[4],mem[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm4 = xmm9[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0],mem[1],xmm15[2,3],mem[4],xmm15[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3],mem[4],xmm7[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3],xmm8[4],mem[5,6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rcx) -; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: addq $472, %rsp # imm = 0x1D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride3_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $136, %rsp -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3 -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14 -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15] -; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-ONLY-NEXT: 
vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm10, %ymm13, %ymm15 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm7 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm13, %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] 
-; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm13, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm12, %ymm11, %ymm14 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm12, %ymm11, %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm6, %ymm4, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm13 +; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm2, %ymm5, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6],ymm1[7],ymm15[8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; 
AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm15[3,4,5,6,7],ymm2[8,9,10],ymm15[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7],ymm7[8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14],ymm2[15] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7,8,9],ymm5[10],ymm14[11,12],ymm5[13],ymm14[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1,2],ymm9[3,4,5,6,7],ymm5[8,9,10],ymm9[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7,8,9],ymm9[10],ymm5[11,12],ymm9[13],ymm5[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7] -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rcx) +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7],ymm9[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm5[2],ymm12[3,4],ymm5[5],ymm12[6,7,8,9],ymm5[10],ymm12[11,12],ymm5[13],ymm12[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm15[2],xmm7[3,4],xmm15[5],xmm7[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7,8,9],ymm5[10],ymm13[11,12],ymm5[13],ymm13[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendw 
{{.*#+}} xmm11 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm11[3,4,5,6,7],ymm5[8,9,10],ymm11[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1,2],ymm5[3],ymm11[4,5],ymm5[6],ymm11[7],ymm5[8],ymm11[9,10],ymm5[11],ymm11[12,13],ymm5[14],ymm11[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7],ymm5[8],ymm14[9,10],ymm5[11],ymm14[12,13],ymm5[14],ymm14[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5,6],xmm6[7] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7],ymm6[8],ymm10[9,10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 
(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -2128,172 +2104,158 @@ ; AVX512F-LABEL: load_i16_stride3_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512F-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512F-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512F-NEXT: vpshufb %ymm11, %ymm6, %ymm12 -; AVX512F-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512F-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512F-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512F-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512F-NEXT: vmovdqa 
32(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512F-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512F-NEXT: vpshufb %ymm3, %ymm10, %ymm2 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm6, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512F-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512F-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-NEXT: 
vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm23, %ymm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512F-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm19 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm21, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512F-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512F-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512F-NEXT: vmovdqa %xmm2, %xmm10 +; AVX512F-NEXT: vpshufb %xmm14, %xmm15, %xmm14 ; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; 
AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512F-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm14[3,4,5,6,7],ymm3[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm14 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15] +; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6],xmm1[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm19, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm10, %xmm24 +; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512F-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vmovdqa %ymm0, %ymm7 +; AVX512F-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm7 +; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm7[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1],xmm1[2],xmm14[3,4],xmm1[5],xmm14[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = 
[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512F-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm22, %ymm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] +; AVX512F-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm8, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7,8,9],ymm7[10],ymm5[11,12],ymm7[13],ymm5[14,15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512F-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512F-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rsi) -; 
AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %ymm22, %ymm2, %ymm6 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm6[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7],ymm3[8],ymm6[9,10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm0, %ymm8 +; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512F-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm2 +; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7],ymm6[8],ymm2[9,10],ymm6[11],ymm2[12,13],ymm6[14],ymm2[15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3,4],xmm14[5],xmm1[6,7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm19, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -345,147 +345,153 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-FAST-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = 
[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -616,491 +622,500 @@ ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1,2,3],xmm9[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0],xmm0[1,2,3],xmm8[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm0[1,2,3],xmm7[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; 
AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = 
xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6,7],ymm4[8],ymm8[9,10,11],ymm4[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7],ymm5[8],ymm8[9,10,11],ymm5[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7],ymm7[8],ymm8[9,10,11],ymm7[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-SLOW-NEXT: vpackusdw %xmm9, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpackusdw %xmm7, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3],ymm11[4],ymm8[5,6,7],ymm11[8],ymm8[9,10,11],ymm11[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-SLOW-NEXT: vpackusdw %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpackusdw %xmm8, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] ; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = 
[0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7],ymm4[8],ymm7[9,10,11],ymm4[12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-NEXT: vpackusdw %xmm8, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7],ymm8[8],ymm7[9,10,11],ymm8[12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpshufb 
%ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] 
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm5[1,2,3],ymm8[4],ymm5[5,6,7],ymm8[8],ymm5[9,10,11],ymm8[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm8, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1,2,3],ymm11[4],ymm5[5,6,7],ymm11[8],ymm5[9,10,11],ymm11[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm11, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, 
%xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; 
AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512F-SLOW-NEXT: vpmovqw %ymm7, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-SLOW-NEXT: vpmovqw %zmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm3, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vpmovqw %zmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm3, %zmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm7, %zmm3 ; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm2, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride4_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] -; AVX512F-FAST-NEXT: vpermt2d %ymm4, 
%ymm8, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-FAST-NEXT: vpmovqw %zmm7, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vpmovqw 
%zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r8) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1138,11 +1153,11 @@ ; SSE-LABEL: load_i16_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $248, %rsp -; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1172,7 +1187,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7] @@ -1181,18 +1196,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] @@ -1201,18 +1216,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] @@ -1358,263 +1373,254 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rdx) +; SSE-NEXT: movapd %xmm12, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm1, 32(%rcx) -; SSE-NEXT: movapd %xmm8, (%rcx) -; SSE-NEXT: movapd %xmm15, 48(%rcx) +; SSE-NEXT: movapd %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm8, 48(%rcx) +; SSE-NEXT: movapd %xmm15, 32(%rcx) ; SSE-NEXT: movapd %xmm10, 16(%rcx) -; SSE-NEXT: movapd %xmm3, 32(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) -; SSE-NEXT: movapd %xmm14, 48(%r8) +; SSE-NEXT: movapd %xmm3, (%r8) +; SSE-NEXT: movapd %xmm7, 48(%r8) +; SSE-NEXT: movapd %xmm14, 32(%r8) ; SSE-NEXT: movapd %xmm11, 16(%r8) ; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: subq $248, %rsp +; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm3[1,2,3],xmm12[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm3[1,2,3],xmm11[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm3[1,2,3],xmm13[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm3[1,2,3],xmm9[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm3[1,2,3],xmm10[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm3[1,2,3],xmm15[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm3[1,2,3],xmm8[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpackusdw %xmm5, 
%xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm3[1,2,3],xmm14[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm3[1,2,3],xmm7[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = 
xmm8[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm13[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $116, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; 
AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm3[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1624,187 +1630,200 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: addq $248, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $168, %rsp -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: subq $232, %rsp +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; 
AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7],ymm3[8],ymm2[9,10,11],ymm3[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm6[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm12, 
%ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7],ymm3[8],ymm2[9,10,11],ymm3[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: 
vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} 
xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1818,27 +1837,27 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] @@ -1851,392 +1870,407 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $168, %rsp +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $104, %rsp -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: subq $136, %rsp +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm8[0,1] +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm13[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, 
%xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 
-; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm10[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} 
xmm9 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm12 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = 
xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm3, %ymm15 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, 
%ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-FAST-NEXT: addq $104, %rsp +; AVX2-FAST-NEXT: addq $136, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $184, %rsp -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: subq $248, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm0[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6,7],ymm4[8],ymm2[9,10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9 +; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6,7],ymm4[8],ymm2[9,10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7],ymm5[8],ymm2[9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm0[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6,7],ymm4[8],ymm2[9,10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6,7],ymm4[8],ymm2[9,10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7],ymm5[8],ymm2[9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: 
vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: 
vpshufd {{.*#+}} xmm8 = xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $184, %rsp +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $248, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2244,97 +2278,97 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm6 +; AVX512F-SLOW-NEXT: vpmovqw %ymm6, %xmm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, 
%xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpmovqw %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm7, %ymm16 +; AVX512F-SLOW-NEXT: vpmovqw %ymm16, %xmm16 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm3 +; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm10 +; AVX512F-SLOW-NEXT: vpmovqw %zmm10, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm13 = xmm5[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm13 ; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,1,2,0,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm7[2,0,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] @@ -2342,26 +2376,26 @@ ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; 
AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 @@ -2369,7 +2403,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2379,66 +2413,64 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] -; AVX512F-FAST-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm12 -; 
AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm13 -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm15 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm15 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; 
AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm11, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm12 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm14 +; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm11, %ymm3 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] @@ -2447,7 +2479,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 @@ -2511,20 +2543,20 @@ ; SSE-LABEL: load_i16_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $824, %rsp # imm = 0x338 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm5 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2548,7 +2580,7 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2559,9 +2591,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2570,9 +2602,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2583,9 +2615,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2594,9 +2626,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2607,10 +2639,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2618,9 +2650,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2630,18 +2662,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa 416(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] @@ -2650,18 +2682,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] @@ -2670,19 +2702,19 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 448(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa 480(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2773,13 +2805,28 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2793,26 +2840,24 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] 
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2820,17 +2865,18 @@ ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2838,35 +2884,37 @@ ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2874,473 +2922,462 @@ ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] 
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm15[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1] -; SSE-NEXT: pshuflw $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = 
mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: pshuflw $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movapd %xmm2, 96(%rcx) -; SSE-NEXT: movapd %xmm5, 32(%rcx) -; SSE-NEXT: movapd %xmm9, 112(%rcx) -; SSE-NEXT: movapd %xmm10, 48(%rcx) -; SSE-NEXT: movapd %xmm14, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movapd %xmm7, 112(%r8) -; SSE-NEXT: movapd %xmm4, 96(%r8) -; 
SSE-NEXT: movapd %xmm3, 80(%r8) -; SSE-NEXT: movapd %xmm15, 64(%r8) -; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) -; SSE-NEXT: movapd %xmm8, 16(%r8) -; SSE-NEXT: movapd %xmm0, (%r8) +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movapd %xmm8, 112(%rcx) +; SSE-NEXT: movapd %xmm10, 96(%rcx) +; SSE-NEXT: movapd %xmm11, 80(%rcx) +; SSE-NEXT: movapd %xmm12, 64(%rcx) +; SSE-NEXT: movapd %xmm13, 48(%rcx) +; SSE-NEXT: movapd %xmm14, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movapd %xmm15, 112(%r8) +; SSE-NEXT: movapd %xmm0, 96(%r8) +; SSE-NEXT: movapd %xmm1, 80(%r8) +; SSE-NEXT: movapd %xmm4, 64(%r8) +; SSE-NEXT: movapd %xmm5, 48(%r8) +; SSE-NEXT: movapd %xmm6, 32(%r8) +; SSE-NEXT: movapd %xmm7, 16(%r8) +; SSE-NEXT: movapd %xmm9, (%r8) ; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: 
load_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $776, %rsp # imm = 0x308 -; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm10[1,2,3],xmm8[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm10[1,2,3],xmm4[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm10[1,2,3],xmm7[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1,2,3],xmm13[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm10[1,2,3],xmm14[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm9 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = 
xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm10[1,2,3],xmm15[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = 
xmm9[0],xmm6[1,2,3],xmm9[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2,3],xmm11[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm10[1,2,3],xmm5[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm14, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm6[1,2,3],xmm15[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm12, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 
16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: 
vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, (%rsp), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} 
xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] @@ -3354,9 +3391,9 @@ ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -3370,84 +3407,79 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3457,23 +3489,24 @@ ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; 
AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3481,458 +3514,492 @@ ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, (%rsp), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: 
vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} 
xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: addq $776, %rsp # imm = 0x308 +; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
ymm4 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm15, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm4 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $232, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: 
vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm2 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw 
{{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = 
xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] @@ -3940,7 +4007,7 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3956,7 +4023,7 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -3965,24 +4032,25 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3990,694 +4058,807 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, (%r8) -; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) +; AVX2-SLOW-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-FAST-NEXT: subq $840, %rsp # imm = 0x348 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: 
vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1,2,3],ymm0[4],ymm12[5,6,7],ymm0[8],ymm12[9,10,11],ymm0[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4],ymm12[5,6,7],ymm1[8],ymm12[9,10,11],ymm1[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 ; AVX2-FAST-NEXT: vmovdqa 
%ymm5, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1,2,3],ymm0[4],ymm12[5,6,7],ymm0[8],ymm12[9,10,11],ymm0[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4],ymm12[5,6,7],ymm1[8],ymm12[9,10,11],ymm1[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4],ymm12[5,6,7],ymm1[8],ymm12[9,10,11],ymm1[12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2,3],ymm4[4],ymm12[5,6,7],ymm4[8],ymm12[9,10,11],ymm4[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4],ymm12[5,6,7],ymm1[8],ymm12[9,10,11],ymm1[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2,3],ymm4[4],ymm12[5,6,7],ymm4[8],ymm12[9,10,11],ymm4[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX2-FAST-NEXT: vpackusdw %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm11 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm15 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; 
AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} 
xmm3 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-NEXT: 
vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw $231, (%rsp), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: 
vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) -; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-NEXT: addq $840, %rsp # imm = 0x348 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-PERLANE-NEXT: subq $904, %rsp # imm = 0x388 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm9 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 +; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, 
%xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7],ymm4[8],ymm0[9,10,11],ymm4[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm6, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 176(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq 
{{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] @@ -4700,92 +4881,55 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] +; 
AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: 
vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] @@ -4794,24 +4938,25 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm10[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4819,530 +4964,538 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = 
mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $904, %rsp # imm = 0x388 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $104, %rsp -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm20 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm20[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm19[0,2,2,3] +; AVX512F-SLOW-NEXT: subq $136, %rsp +; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm18 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm18[0,2,2,3] +; 
AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm21 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; 
AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm27[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm28[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm30[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm26[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm29[0,2,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm24 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm21[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm25 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm30[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm26[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: 
vpunpckldq {{.*#+}} xmm21 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm18[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm29[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[3,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm25[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm29 -; AVX512F-SLOW-NEXT: vpmovqw %zmm29, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,1,2,3] +; 
AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm27[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm26, %ymm12 +; AVX512F-SLOW-NEXT: vpmovqw %ymm12, %xmm12 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512F-SLOW-NEXT: vpmovqw %zmm27, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 ; AVX512F-SLOW-NEXT: vpmovqw %ymm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm5 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm25 -; AVX512F-SLOW-NEXT: vpmovqw %zmm25, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512F-SLOW-NEXT: vpmovqw %zmm26, %xmm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm5 ; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm7 +; AVX512F-SLOW-NEXT: vpmovqw %zmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm5 +; AVX512F-SLOW-NEXT: vpmovqw %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm27, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm26, %zmm13 +; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm12[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm7, %zmm13 +; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm5, %zmm15 +; AVX512F-SLOW-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm27, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm26, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm6[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm8[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm25, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; 
AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm7, %zmm11 +; AVX512F-SLOW-NEXT: vpmovqw %zmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm11 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm5, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm11[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm27, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; 
AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpmovqw %zmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm7 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm4, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-SLOW-NEXT: addq $104, %rsp +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512F-SLOW-NEXT: addq $136, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride4_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm24 -; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm1, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 -; 
AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm8, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm27 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm12 -; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm12 -; AVX512F-FAST-NEXT: vpmovqw %zmm30, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 480(%rdi), %ymm16 -; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm17 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm1, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm26, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm1, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 320(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 -; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm25 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm8, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-FAST-NEXT: vpermd %ymm26, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm12 +; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm16 +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm8, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm8, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm8, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm22, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm8[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm14 ; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm30, %zmm8 -; AVX512F-FAST-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm3[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; 
AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm23, %zmm3 -; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm15, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm30, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm11 -; AVX512F-FAST-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm1 -; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm2 -; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm22, %zmm6 +; AVX512F-FAST-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm3[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm27, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm11 +; 
AVX512F-FAST-NEXT: vpmovqw %zmm11, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm27, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm13 +; AVX512F-FAST-NEXT: vpermd %ymm26, %ymm27, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm2, %zmm14 +; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm27, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm14 +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm27, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm15 +; AVX512F-FAST-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm27, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm27, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm22, %zmm10 +; AVX512F-FAST-NEXT: vpmovqw %zmm10, %xmm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm30, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm23, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rsi) -; 
AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm22, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -224,38 +224,38 @@ ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm1[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovq %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rdx) ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rcx) ; AVX1-ONLY-NEXT: vmovq %xmm6, (%r8) -; AVX1-ONLY-NEXT: vmovq %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovq %xmm0, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf4: @@ -265,26 +265,27 @@ ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} 
xmm5 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm4[1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovq %xmm3, (%rsi) -; AVX2-SLOW-NEXT: vmovq %xmm4, (%rdx) -; AVX2-SLOW-NEXT: vmovq %xmm5, (%rcx) -; AVX2-SLOW-NEXT: vmovq %xmm6, (%r8) -; AVX2-SLOW-NEXT: vmovq %xmm0, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm5, (%rdx) +; AVX2-SLOW-NEXT: vmovq %xmm6, (%rcx) +; AVX2-SLOW-NEXT: vmovq %xmm0, (%r8) +; AVX2-SLOW-NEXT: vmovq %xmm1, (%r9) ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf4: @@ -293,25 +294,26 @@ ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm4[1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FAST-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FAST-NEXT: vmovq %xmm6, (%r8) -; AVX2-FAST-NEXT: vmovq %xmm0, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FAST-NEXT: vmovq %xmm6, (%rcx) +; AVX2-FAST-NEXT: vmovq %xmm0, (%r8) +; AVX2-FAST-NEXT: vmovq %xmm1, (%r9) ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf4: @@ -320,25 +322,26 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm4[1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: 
vmovq %xmm1, (%r9) ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf4: @@ -356,21 +359,20 @@ ; AVX512F-SLOW-NEXT: vmovd %r10d, %xmm4 ; AVX512F-SLOW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpextrw $3, %xmm1, %eax -; AVX512F-SLOW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vmovd %xmm2, %eax -; AVX512F-SLOW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512F-SLOW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],mem[1,2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rdx) ; AVX512F-SLOW-NEXT: vmovq %xmm5, (%rcx) ; AVX512F-SLOW-NEXT: vmovq %xmm6, (%r8) ; AVX512F-SLOW-NEXT: vmovq %xmm0, (%r9) @@ -381,58 +383,92 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpextrw $5, %xmm0, %eax +; AVX512F-FAST-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpextrw $7, %xmm1, %eax ; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpextrw $3, %xmm1, %eax -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vmovd %xmm2, %eax -; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],mem[1,2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; 
AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm1, %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,7,12,13,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%rdx) +; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) ; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r8) ; AVX512F-FAST-NEXT: vmovq %xmm0, (%r9) ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: load_i16_stride5_vf4: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0] -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17] -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512BW-NEXT: vmovq %xmm1, (%rsi) -; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-SLOW-LABEL: load_i16_stride5_vf4: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpextrw $5, %xmm0, %eax +; AVX512BW-SLOW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpextrw $7, %xmm1, %eax +; AVX512BW-SLOW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpextrw $6, %xmm0, %eax +; AVX512BW-SLOW-NEXT: vpextrw $1, %xmm0, %r10d +; AVX512BW-SLOW-NEXT: vmovd %r10d, %xmm0 +; AVX512BW-SLOW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpextrw $3, %xmm1, %eax +; AVX512BW-SLOW-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,7,12,17,2,7,12,17] +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-SLOW-NEXT: vpermi2w %ymm3, %ymm4, %ymm1 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 +; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-SLOW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-SLOW-NEXT: vmovq %xmm5, (%r8) +; AVX512BW-SLOW-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: load_i16_stride5_vf4: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; 
AVX512BW-FAST-NEXT: vpextrw $5, %xmm0, %eax +; AVX512BW-FAST-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2 +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512BW-FAST-NEXT: vpextrw $7, %xmm1, %eax +; AVX512BW-FAST-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,6,11,0,1,6,11,0] +; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm3 +; AVX512BW-FAST-NEXT: vpinsrw $3, 32(%rdi), %xmm3, %xmm0 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,7,12,17,2,7,12,17] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vpermi2w %ymm3, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] +; AVX512BW-FAST-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] +; AVX512BW-FAST-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 +; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FAST-NEXT: vmovq %xmm5, (%r8) +; AVX512BW-FAST-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> %strided.vec1 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> @@ -585,7 +621,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] @@ -598,7 +634,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] @@ -679,10 +715,10 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [3,1,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm6[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -811,10 +847,10 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [3,1,6,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -1235,9 +1271,9 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,4,6,1,7] +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] @@ -1263,9 +1299,9 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,0,7,5,2,0> +; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] @@ -1313,10 +1349,9 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,3,1,u,0,3,5,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -1325,14 +1360,13 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,6,1,7> ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,10,11,0,1,6,7,2,3,4,5,0,1,18,19,28,29,26,27,16,17,22,23,18,19,20,21,16,17] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,3,2,u,1,3,6,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] @@ -1351,14 +1385,14 @@ ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [3,1,6,4] +; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,3,5,7,2,0> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] @@ -1416,9 +1450,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] @@ -1444,9 +1478,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,0,7,5,2,0> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] @@ -1477,85 +1511,85 @@ ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 
(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6],ymm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; 
AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero ; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-SLOW-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,0,7,5,2,0> +; AVX512F-SLOW-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rsi) +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8) @@ -1565,93 +1599,90 @@ ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,u,u,u,4,6,1,3> -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,4,7,1,6> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,1,u,0,3,5,u> +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,u,u,u,4,6,1,3> ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero ; AVX512F-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,3,2,u,1,3,6,u> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,6,1,7> +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4> ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [3,1,6,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,3,5,7,2,0> ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] -; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r8) -; 
AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1705,19 +1736,19 @@ ; SSE-LABEL: load_i16_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm12 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 @@ -1761,30 +1792,30 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 272(%rdi), %xmm4 +; SSE-NEXT: movdqa 192(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] ; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm13 +; SSE-NEXT: movdqa 160(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: movaps %xmm15, %xmm2 @@ -1792,25 +1823,25 @@ ; SSE-NEXT: andps %xmm15, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; 
SSE-NEXT: movdqa 256(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 240(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movaps %xmm15, %xmm1 @@ -2160,38 +2191,38 @@ ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps %xmm10, 16(%r8) -; SSE-NEXT: movaps %xmm9, 48(%r8) -; SSE-NEXT: movaps %xmm13, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps %xmm11, 16(%r9) -; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps %xmm3, 32(%r9) +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm10, 48(%r8) +; SSE-NEXT: movaps %xmm9, 32(%r8) +; SSE-NEXT: movaps %xmm13, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm11, 48(%r9) +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; @@ -2357,7 +2388,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; 
AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,8,9,2,3,12,13,12,13,12,13,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] @@ -2409,7 +2440,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,10,11,4,5,14,15,14,15,14,15,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] @@ -2528,195 +2559,186 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-SLOW-NEXT: subq $200, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4],ymm3[5],ymm12[6,7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12],ymm3[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4],ymm2[5],ymm1[6,7],ymm2[8],ymm1[9,10],ymm2[11],ymm1[12],ymm2[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7] ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5],ymm3[6],ymm13[7,8],ymm3[9],ymm13[10,11],ymm3[12],ymm13[13],ymm3[14],ymm13[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,4,6,1,7] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm13[1],ymm3[2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm15, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7],ymm12[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm11[2],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm13[2],xmm14[3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0],xmm14[1],xmm13[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10],ymm12[11],ymm4[12,13],ymm12[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [3,1,2,0,7,5,2,0] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm10[2],xmm11[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm14[2],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = 
ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 
$1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] @@ -2727,79 +2749,75 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: addq $200, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,6,1,7> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = 
[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,10,11,0,1,6,7,2,3,4,5,0,1,18,19,28,29,26,27,16,17,22,23,18,19,20,21,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 @@ -2811,20 +2829,21 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7] @@ -2835,104 +2854,98 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4> ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,3,5,7,2,0> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [3,1,6,4] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,12,13,10,11,4,5,2,3,8,9,0,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpshufb 
%xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] ; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm11[2],mem[3],ymm11[4],mem[5,6],ymm11[7],mem[8,9],ymm11[10],mem[11],ymm11[12],mem[13,14],ymm11[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = 
[16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3],ymm14[4],mem[5,6],ymm14[7],mem[8,9],ymm14[10],mem[11],ymm14[12],mem[13,14],ymm14[15] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -2956,31 +2969,28 @@ ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) -; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4],ymm0[5],ymm4[6,7],ymm0[8],ymm4[9,10],ymm0[11],ymm4[12],ymm0[13],ymm4[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 
-; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] @@ -2991,190 +3001,191 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm9 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm8, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm13[5,6,7],ymm0[8,9,10,11,12],ymm13[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2,3,4],ymm13[5,6,7],ymm12[8,9,10,11,12],ymm13[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm3[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm8[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, 
%xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm8[0],xmm9[1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5],ymm10[6],ymm4[7,8],ymm10[9],ymm4[10,11],ymm10[12],ymm4[13],ymm10[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0],xmm15[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm2[1,2],ymm6[3],ymm2[4],ymm6[5],ymm2[6,7],ymm6[8],ymm2[9,10],ymm6[11],ymm2[12],ymm6[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7,8],ymm4[9],ymm10[10],ymm4[11],ymm10[12,13],ymm4[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [3,1,2,0,7,5,2,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm11, %ymm13, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0,1],xmm15[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: 
vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm11[1],ymm0[2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm4[2],ymm7[3],ymm4[4],ymm7[5,6],ymm4[7],ymm7[8,9],ymm4[10],ymm7[11],ymm4[12],ymm7[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = 
[8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm11[2],ymm0[3],ymm11[4],ymm0[5,6],ymm11[7],ymm0[8,9],ymm11[10],ymm0[11],ymm11[12],ymm0[13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11],ymm10[12],ymm6[13],ymm10[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: 
vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -3221,29 +3232,29 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0],xmm12[1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm15 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm16 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [7,5,2,0,7,5,6,4] +; AVX512F-SLOW-NEXT: vpermd %ymm15, %ymm17, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[14,15,0,1,12,13,12,13,10,11,4,5,2,3,8,9,30,31,16,17,28,29,28,29,26,27,20,21,18,19,24,25] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-SLOW-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,5,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6],xmm14[7] @@ -3252,8 +3263,8 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm18 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm17 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4,5,6,7] @@ -3281,34 +3292,34 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0],xmm11[1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm17 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm19 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0],xmm4[1],xmm13[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,0,7,5,2,0> +; AVX512F-SLOW-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm9[0],xmm10[1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,6,1,3,4,6,5,7] +; AVX512F-SLOW-NEXT: vpermd %ymm15, %ymm18, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[10,11,10,11,4,5,8,9,14,15,4,5,2,3,12,13,26,27,26,27,20,21,24,25,30,31,20,21,18,19,28,29] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm12[0,1],xmm11[2],xmm12[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-SLOW-NEXT: 
vextracti128 $1, %ymm14, %xmm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2,3],xmm15[4,5],xmm14[6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm14[1,2,3,4,5,6,7],ymm4[8],ymm14[9,10,11,12,13,14,15] @@ -3351,8 +3362,8 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper @@ -3360,178 +3371,180 @@ ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,3,2,4,u,u,u> -; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,u,u,u,4,6,1,3> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,1,u,0,3,5,u> -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm15, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm16 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6],xmm6[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,3,2,u,1,3,6,u> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm19 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm15, %zmm12 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm2, %xmm6 -; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm17, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm14[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] -; AVX512F-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm17, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm17 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm18, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4],xmm12[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm18, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} 
ymm14 = [1,4,6,0,1,4,6,0] -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm15, %zmm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,u,u,u,4,6,1,3> +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = 
ymm13[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,5,0,0,3,5,0] ; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm12, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0],ymm14[1,2,3,4,5,6,7],ymm6[8],ymm14[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm17 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,4,6,1,7> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3],xmm15[4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,6,0,1,3,6,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [1,3,6,4,1,3,6,4] +; AVX512F-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,28,29,26,27,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm18 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,3,5,2,5,7,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm19, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm19, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,0,1,4,6,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [3,1,6,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,u,u,3,5,7,2,0> +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm16, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] +; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,4,7,0,2,4,7,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm7, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm13[1,2,3,4,5,6,7],ymm2[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3,4],xmm11[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,3,0,2,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-FAST-NEXT: movb $7, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm3, %ymm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; 
AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -3614,617 +3627,621 @@ ; SSE-LABEL: load_i16_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 -; SSE-NEXT: movdqa 464(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm8 -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa 448(%rdi), %xmm4 +; SSE-NEXT: movdqa 384(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm15 -; SSE-NEXT: movdqa 96(%rdi), %xmm10 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: movdqa 336(%rdi), %xmm9 +; SSE-NEXT: movdqa 368(%rdi), %xmm14 +; SSE-NEXT: movdqa 352(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: 
andps %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; 
SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa 432(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa 400(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), 
%xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm2 +; SSE-NEXT: movdqa 512(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa 496(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 560(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa 480(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 240(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: andps %xmm13, %xmm5 -; SSE-NEXT: orps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 608(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: movdqa 576(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa 560(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: orps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; 
SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd $237, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: psllq $48, %xmm4 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: andnps %xmm4, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: psllq $48, %xmm13 +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: andnps %xmm13, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: psrlq $48, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3] +; SSE-NEXT: 
pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm15[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 
+; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0,1,3] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,0] +; 
SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; 
SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm13, %xmm3 +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm5, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] @@ -4234,69 +4251,97 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps 
%xmm14, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,0] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm15, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,0] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: andnps %xmm15, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm12, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: andnps %xmm11, 
%xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: andnps %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4305,52 +4350,25 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm7, %xmm2 -; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[3,0] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: andnps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4359,113 +4377,112 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm14 ; SSE-NEXT: por %xmm2, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: andnps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: andnps %xmm4, %xmm0 ; 
SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pand %xmm8, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} 
xmm12 = xmm12[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm11[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: movaps %xmm7, %xmm11 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: andnps %xmm11, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: andnps %xmm11, %xmm8 ; SSE-NEXT: 
shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -4473,15 +4490,15 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,2,3] @@ -4491,7 +4508,7 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -4499,7 +4516,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,2,2,3] @@ -4536,61 +4553,62 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] -; SSE-NEXT: orps %xmm9, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] +; SSE-NEXT: orps %xmm9, %xmm8 +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 
# 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps %xmm10, 112(%r8) ; SSE-NEXT: movaps %xmm14, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4605,171 +4623,85 @@ ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps %xmm6, 112(%r9) +; SSE-NEXT: movaps %xmm8, 112(%r9) ; SSE-NEXT: movaps %xmm2, 96(%r9) ; SSE-NEXT: movaps %xmm3, 80(%r9) ; SSE-NEXT: movaps %xmm4, 64(%r9) ; SSE-NEXT: movaps %xmm5, 48(%r9) -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm6, 32(%r9) ; SSE-NEXT: movaps %xmm7, 16(%r9) -; SSE-NEXT: movaps %xmm8, (%r9) +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: vmovdqa 
144(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, 
%ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw 
{{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] @@ -4786,1914 +4718,1957 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 
512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm0[4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm3[2,3],mem[4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm15[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = 
xmm8[0,1],xmm12[2,3],xmm8[4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3],xmm7[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] ; 
AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $48, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = 
xmm14[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1],mem[2,3],xmm11[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,2,3,12,13,12,13,12,13,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm3[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm6[0,1,2,3],mem[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm12[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm12[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm7[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,10,11,4,5,14,15,14,15,14,15,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm6[0,1,2,3],mem[4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm8[2,3],xmm12[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm15 = xmm14[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm9[2,3],xmm7[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm11[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm2[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1],xmm14[2,3],xmm3[4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm14[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm14[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2,3],mem[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] 
+; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: 
vmovaps %ymm11, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1048, %rsp # imm = 0x418 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $1016, %rsp # imm = 0x3F8 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm5[1,2],ymm14[3],ymm5[4],ymm14[5],ymm5[6,7],ymm14[8],ymm5[9,10],ymm14[11],ymm5[12],ymm14[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu 
%ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4,5],xmm5[6,7] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4],ymm3[5],ymm6[6,7],ymm3[8],ymm6[9,10],ymm3[11],ymm6[12],ymm3[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] -; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm13[1],ymm6[2,3],ymm13[4],ymm6[5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10,11],ymm13[12],ymm6[13],ymm13[14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} 
ymm5 = [0,2,1,3,4,6,1,7] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm9[1],mem[2],ymm9[3],mem[4,5],ymm9[6],mem[7,8],ymm9[9],mem[10],ymm9[11],mem[12,13],ymm9[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5,6],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3],xmm10[4,5,6],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 
608(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0],xmm4[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm10[5,6,7],ymm3[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm3[0],xmm5[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1,2,3,4],ymm12[5,6,7],ymm3[8,9,10,11,12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm3[0],xmm5[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm5[2],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5,6,7],ymm13[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm8[2],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 
= ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm15[2],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm15, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm5[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, 
%xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4],xmm4[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0],xmm8[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1],ymm15[2],ymm11[3],ymm15[4],ymm11[5,6],ymm15[7],ymm11[8,9],ymm15[10],ymm11[11],ymm15[12],ymm11[13,14],ymm15[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4],xmm4[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4],xmm3[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm12[1],xmm9[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,1,2,0,7,5,2,0] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 
= [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm12[2],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm13[0],mem[1,2],ymm13[3],mem[4],ymm13[5],mem[6,7],ymm13[8],mem[9,10],ymm13[11],mem[12],ymm13[13],mem[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = xmm2[0,1],mem[2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7,8],ymm2[9],ymm10[10],ymm2[11],ymm10[12,13],ymm2[14],ymm10[15] +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = xmm8[0,1],mem[2],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4],ymm15[5],ymm11[6,7],ymm15[8],ymm11[9,10],ymm15[11],ymm11[12],ymm15[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm6[1],ymm14[2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10],ymm6[11],ymm14[12,13],ymm6[14],ymm14[15] +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm11[2],xmm9[3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm2[2],ymm10[3],ymm2[4],ymm10[5,6],ymm2[7],ymm10[8,9],ymm2[10],ymm10[11],ymm2[12],ymm10[13,14],ymm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = 
[24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1],ymm6[2],ymm14[3],ymm6[4],ymm14[5,6],ymm6[7],ymm14[8,9],ymm6[10],ymm14[11],ymm6[12],ymm14[13,14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; 
AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 96(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $1048, %rsp # imm = 0x418 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: addq $1016, %rsp # imm = 0x3F8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: subq 
$1032, %rsp # imm = 0x408 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3],xmm0[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13],ymm4[14],ymm9[15] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2,3],ymm14[4],ymm4[5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10,11],ymm14[12],ymm4[13],ymm14[14],ymm4[15] ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13],ymm4[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 
= ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1,2],ymm4[3],ymm7[4],ymm4[5],ymm7[6,7],ymm4[8],ymm7[9,10],ymm4[11],ymm7[12],ymm4[13],ymm7[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10],ymm8[11],ymm2[12,13],ymm8[14],ymm2[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $74, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,u,u,4,6,1,7> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,10,11,0,1,6,7,2,3,4,5,0,1,18,19,28,29,26,27,16,17,22,23,18,19,20,21,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5],ymm9[6],mem[7,8],ymm9[9],mem[10,11],ymm9[12],mem[13],ymm9[14],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm11 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10],ymm10[11],ymm7[12,13],ymm10[14],ymm7[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,1,3,0,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = 
[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7],ymm3[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7],ymm3[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: 
vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm11 -; AVX2-FAST-NEXT: 
vpshufb %ymm8, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5],ymm8[6],ymm4[7,8],ymm8[9],ymm4[10,11],ymm8[12],ymm4[13],ymm8[14],ymm4[15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,7,2,0> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [3,1,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,12,13,10,11,4,5,2,3,8,9,0,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1,2],ymm7[3],mem[4],ymm7[5],mem[6,7],ymm7[8],mem[9,10],ymm7[11],mem[12],ymm7[13],mem[14,15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4],ymm15[5],ymm13[6,7],ymm15[8],ymm13[9,10],ymm15[11],ymm13[12],ymm15[13],ymm13[14,15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4],xmm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = 
[16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm9[2],ymm12[3],ymm9[4],ymm12[5,6],ymm9[7],ymm12[8,9],ymm9[10],ymm12[11],ymm9[12],ymm12[13,14],ymm9[15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm8[2],mem[3],ymm8[4],mem[5,6],ymm8[7],mem[8,9],ymm8[10],mem[11],ymm8[12],mem[13,14],ymm8[15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4],xmm11[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa 
%ymm2, (%r9) +; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1080, %rsp # imm = 0x438 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm5[1,2],ymm14[3],ymm5[4],ymm14[5],ymm5[6,7],ymm14[8],ymm5[9,10],ymm14[11],ymm5[12],ymm14[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, 
%ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm15[1],ymm11[2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10],ymm15[11],ymm11[12,13],ymm15[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw 
$173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10],ymm10[11],ymm13[12,13],ymm10[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3],xmm7[4,5,6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0],xmm5[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4],ymm4[5,6,7],ymm2[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm12[1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm4[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm5[2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm9[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm12[2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm10[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0],xmm6[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0],xmm7[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm9[2],ymm4[3],ymm9[4],ymm4[5,6],ymm9[7],ymm4[8,9],ymm9[10],ymm4[11],ymm9[12],ymm4[13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3,4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm13[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [3,1,2,0,7,5,2,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1],xmm13[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7],ymm3[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1,2],ymm3[3],mem[4],ymm3[5],mem[6,7],ymm3[8],mem[9,10],ymm3[11],mem[12],ymm3[13],mem[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, 
%xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7],ymm3[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm11[2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7],ymm3[8,9,10,11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10],ymm12[11],ymm9[12,13],ymm12[14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm14[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 
# 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = 
mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4],xmm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5],ymm6[6],mem[7,8],ymm6[9],mem[10,11],ymm6[12],mem[13],ymm6[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5],ymm7[6],mem[7,8],ymm7[9],mem[10,11],ymm7[12],mem[13],ymm7[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm1 
+; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $1080, %rsp # imm = 0x438 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-SLOW-NEXT: subq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] @@ -6701,742 +6676,740 @@ ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0],xmm3[1],xmm13[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4],ymm3[5],ymm10[6,7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12],ymm3[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5],ymm12[6],ymm5[7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13],ymm12[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,4,6,1,7] +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,4,5,18,19,28,29,26,27,16,17,22,23,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6],xmm4[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm27 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 144(%rdi), %xmm16 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: 
vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm15[2],ymm1[3],ymm15[4],ymm1[5,6],ymm15[7],ymm1[8,9],ymm15[10],ymm1[11],ymm15[12],ymm1[13,14],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm28[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm31 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm31[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,11,10,11,14,15,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 ; AVX512F-SLOW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1,2],ymm7[3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm24[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} 
xmm9 = xmm9[6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm23, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm13[2],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4],ymm14[5],ymm4[6,7],ymm14[8],ymm4[9,10],ymm14[11],ymm4[12],ymm14[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm14 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [7,5,2,0,7,5,6,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpermd %ymm11, %ymm10, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [14,15,0,1,12,13,12,13,10,11,4,5,2,3,8,9,30,31,16,17,28,29,28,29,26,27,20,21,18,19,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1],xmm2[2],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm9 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm5[1,2],ymm15[3],ymm5[4],ymm15[5],ymm5[6,7],ymm15[8],ymm5[9,10],ymm15[11],ymm5[12],ymm15[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512F-SLOW-NEXT: 
vpshufd {{.*#+}} xmm9 = xmm20[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm31, %xmm10 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10,11],ymm11[12],ymm6[13],ymm11[14],ymm6[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2],ymm8[3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm28[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm27, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm4[2],xmm13[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm23, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: 
vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0],xmm13[1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm20 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm20[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm31[2],xmm11[3],xmm31[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10,11],ymm15[12],ymm5[13],ymm15[14],ymm5[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10,11],ymm15[12],ymm11[13],ymm15[14],ymm11[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3],ymm6[4],ymm3[5,6],ymm6[7],ymm3[8,9],ymm6[10],ymm3[11],ymm6[12],ymm3[13,14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3,4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm23, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [3,1,2,0,7,5,2,0] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %ymm1, %ymm24, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = 
[6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0],xmm14[1],xmm10[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,6,1,3,4,6,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm12[2],ymm8[3],ymm12[4],ymm8[5,6],ymm12[7],ymm8[8,9],ymm12[10],ymm8[11],ymm12[12],ymm8[13,14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm25, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,10,11,4,5,8,9,14,15,4,5,2,3,12,13,26,27,26,27,20,21,24,25,30,31,20,21,18,19,28,29] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm7[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm9[2],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm29 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1,2,3],xmm1[4,5],xmm15[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; 
AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5],ymm5[6],ymm11[7,8],ymm5[9],ymm11[10,11],ymm5[12],ymm11[13],ymm5[14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm8[1],xmm12[2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm27[2],xmm15[3],xmm27[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3,4,5,6,7] -; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm20[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $173, (%rsp), %ymm3, %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6],ymm12[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm24 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] 
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpermd %ymm2, %ymm24, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0],xmm8[1],xmm14[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] +; AVX512F-SLOW-NEXT: vpermd %ymm9, %ymm25, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm4 = mem[0,1],xmm4[2],mem[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13],ymm12[14],ymm13[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm11[1],ymm5[2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7,8],ymm11[9],ymm5[10],ymm11[11],ymm5[12,13],ymm11[14],ymm5[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm2[1],xmm7[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm1[2],ymm4[3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11],ymm1[12],ymm4[13,14],ymm1[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm1[1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm5[2],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4,5],xmm7[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm21 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm11[2],ymm1[3],ymm11[4],ymm1[5,6],ymm11[7],ymm1[8,9],ymm11[10],ymm1[11],ymm11[12],ymm1[13,14],ymm11[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: 
vextracti128 $1, %ymm4, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4],ymm10[5],ymm5[6,7],ymm10[8],ymm5[9,10],ymm10[11],ymm5[12],ymm10[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm3 ; AVX512F-SLOW-NEXT: movb $7, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw $41, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2],xmm10[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4],ymm3[5],ymm5[6,7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12],ymm3[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] +; AVX512F-SLOW-NEXT: vpblendw $107, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX512F-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm2, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, 64(%rsi) +; AVX512F-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 64(%r8) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%r8) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-SLOW-NEXT: addq $616, %rsp # imm = 0x268 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm29 -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm18, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-FAST-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm6[1,2],ymm14[3],ymm6[4],ymm14[5],ymm6[6,7],ymm14[8],ymm6[9,10],ymm14[11],ymm6[12],ymm14[13],ymm6[14,15] +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1,2,3],xmm6[4,5],xmm8[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm10[0,1],ymm0[2],ymm10[3],ymm0[4],ymm10[5,6],ymm0[7],ymm10[8,9],ymm0[10],ymm10[11],ymm0[12],ymm10[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm9[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,3,1,3,0,3,5,7] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm20, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm23, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 +; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1,2,3],xmm4[4,5],xmm8[6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: 
vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm20, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm23, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm30 -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10],ymm0[11],ymm12[12,13],ymm0[14],ymm12[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm22 -; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <2,u,u,u,4,6,1,7> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vporq %ymm1, %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vporq %ymm1, %ymm0, %ymm19 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm29, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[2,3,12,13,12,13,14,15,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm20, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm22, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,3,6,4,1,3,6,4] ; AVX512F-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm7[1,2],ymm15[3],ymm7[4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10],ymm15[11],ymm7[12],ymm15[13],ymm7[14,15] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,12,13,10,11,0,1,6,7,8,9,16,17,18,19,18,19,28,29,26,27,16,17,22,23,24,25] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm25, %ymm10 -; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm18, %zmm26, %zmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm27 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm19, %zmm23, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6],xmm10[7] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm28 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm21, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm20, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm27, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm22, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm26, %xmm4 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4],ymm6[5],ymm8[6,7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12],ymm6[13],ymm8[14,15] +; AVX512F-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm25, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm30 +; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,4,5,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: 
vpshufb %xmm7, %xmm13, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm20 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm29[2],xmm1[3],xmm29[3] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,3,5,2,5,7,u,u> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm5[1],ymm14[2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7,8],ymm5[9],ymm14[10],ymm5[11],ymm14[12,13],ymm5[14],ymm14[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm25, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm0[2],ymm12[3],ymm0[4],ymm12[5,6],ymm0[7],ymm12[8,9],ymm0[10],ymm12[11],ymm0[12],ymm12[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; 
AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] ; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm5, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm5, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm24, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm14 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm15[1],ymm11[2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10],ymm15[11],ymm11[12,13],ymm15[14],ymm11[15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm25, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm26[2],xmm4[3],xmm26[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm4[2],ymm7[3],ymm4[4],ymm7[5,6],ymm4[7],ymm7[8,9],ymm4[10],ymm7[11],ymm4[12],ymm7[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm22 -; AVX512F-FAST-NEXT: vextracti128 $1, 
%ymm2, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10,11],ymm4[12],ymm13[13],ymm4[14],ymm13[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm25, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm5, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %ymm23 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm10[2],ymm1[3],ymm10[4],ymm1[5,6],ymm10[7],ymm1[8,9],ymm10[10],ymm1[11],ymm10[12],ymm1[13,14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm16 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm30, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm21, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm9[1],xmm6[2,3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7],ymm14[8,9],ymm13[10],ymm14[11],ymm13[12],ymm14[13,14],ymm13[15] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 @@ -7444,157 +7417,155 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = 
[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0],xmm14[1],xmm15[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm31 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm12[1],xmm7[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm21 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm8[2],ymm6[3],ymm8[4],ymm6[5,6],ymm8[7],ymm6[8,9],ymm8[10],ymm6[11],ymm8[12],ymm6[13,14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm20 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1],ymm15[2],ymm11[3],ymm15[4],ymm11[5,6],ymm15[7],ymm11[8,9],ymm15[10],ymm11[11],ymm15[12],ymm11[13,14],ymm15[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm29 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm30, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm9[2],xmm7[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm9[2],xmm6[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <1,4,6,3,6,u,u,u> ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm26, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: 
vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm26, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,7,2,0> +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [3,1,6,4] ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm15[1,2],ymm4[3],ymm15[4],ymm4[5],ymm15[6,7],ymm4[8],ymm15[9,10],ymm4[11],ymm15[12],ymm4[13],ymm15[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm25 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm11 -; 
AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm15[1,2],ymm6[3],ymm15[4],ymm6[5],ymm15[6,7],ymm6[8],ymm15[9,10],ymm6[11],ymm15[12],ymm6[13],ymm15[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,12,13,10,11,4,5,2,3,8,9,0,1,2,3] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm25 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3],xmm0[4,5],xmm12[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm25, %zmm20 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm27, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm26, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm14 +; 
AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10],ymm10[11],ymm7[12,13],ymm10[14],ymm7[15] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm27, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm30, %ymm26, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; 
AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10,11],ymm6[12],ymm15[13],ymm6[14],ymm15[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-FAST-NEXT: movb $7, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm2 +; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm6 = mem[0],ymm11[1],mem[2],ymm11[3],mem[4,5],ymm11[6],mem[7,8],ymm11[9],mem[10],ymm11[11],mem[12,13],ymm11[14],mem[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm6[2],ymm13[3],ymm6[4],ymm13[5,6],ymm6[7],ymm13[8,9],ymm6[10],ymm13[11],ymm6[12],ymm13[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3],ymm12[4],ymm6[5,6],ymm12[7],ymm6[8,9],ymm12[10],ymm6[11],ymm12[12],ymm6[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rsi) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rdx) @@ -7602,11 +7573,11 @@ ; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -7727,10 +7698,8 @@ ; AVX2: {{.*}} ; 
AVX2-ONLY: {{.*}} ; AVX512: {{.*}} -; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512BW-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -359,7 +359,7 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] @@ -385,10 +385,11 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,2,3,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] @@ -414,10 +415,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,2,3,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] @@ -442,28 +444,25 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} 
xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] -; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpsrld $16, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovq %xmm5, (%rcx) -; AVX512F-SLOW-NEXT: vmovq %xmm1, (%r8) -; AVX512F-SLOW-NEXT: vmovq %xmm2, (%r9) -; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rax) -; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rdx) +; AVX512F-SLOW-NEXT: vmovq %xmm6, (%rcx) +; AVX512F-SLOW-NEXT: vmovq %xmm5, (%r8) +; AVX512F-SLOW-NEXT: vmovq %xmm1, (%r9) +; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rax) ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride6_vf4: @@ -475,27 +474,25 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] -; AVX512F-FAST-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpsrld $16, %xmm1, 
%xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,2,3,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovq %xmm0, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%r8) -; AVX512F-FAST-NEXT: vmovq %xmm2, (%r9) -; AVX512F-FAST-NEXT: vmovq %xmm4, (%rax) -; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) +; AVX512F-FAST-NEXT: vmovq %xmm6, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%r9) +; AVX512F-FAST-NEXT: vmovq %xmm0, (%rax) ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride6_vf4: @@ -742,7 +739,7 @@ ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] @@ -751,7 +748,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] @@ -805,7 +802,7 @@ ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] @@ -813,7 +810,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm6[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] @@ -865,7 +862,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] @@ -873,7 +870,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] @@ -1581,29 +1578,29 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw 
{{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 @@ -1623,9 +1620,8 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] @@ -1637,9 +1633,8 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] @@ -1684,10 +1679,10 @@ ; ; AVX2-FAST-LABEL: load_i16_stride6_vf16: ; 
AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] @@ -1696,30 +1691,30 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = 
xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -1729,16 +1724,15 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] @@ -1748,43 +1742,42 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; 
AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -1792,10 +1785,10 @@ ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), 
%ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] @@ -1804,30 +1797,30 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm5[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -1837,16 +1830,15 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] @@ -1856,43 +1848,42 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw 
{{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1916,25 +1907,25 @@ ; AVX512F-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 @@ -1972,7 +1963,7 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] @@ -1993,7 +1984,7 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] @@ -2028,25 +2019,25 @@ ; AVX512F-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw 
{{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 @@ -2080,7 +2071,7 @@ ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] @@ -2099,7 +2090,7 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] @@ -2191,9 +2182,9 @@ ; SSE-LABEL: load_i16_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm5 +; SSE-NEXT: movdqa 128(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rdi), %xmm0 @@ -2239,7 +2230,7 @@ ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2248,8 +2239,8 @@ ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -2263,7 +2254,7 @@ ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7] @@ -2308,13 +2299,13 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm10, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa 112(%rdi), %xmm11 -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm11 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] @@ -2322,8 +2313,8 @@ ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: movdqa 352(%rdi), %xmm5 +; SSE-NEXT: movdqa 368(%rdi), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; SSE-NEXT: movdqa %xmm9, %xmm2 @@ -2337,7 +2328,7 @@ ; SSE-NEXT: pslld $16, %xmm9 ; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] @@ -2735,45 +2726,45 @@ ; SSE-NEXT: pandn %xmm8, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm3, 48(%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) -; SSE-NEXT: movdqa %xmm5, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) ; SSE-NEXT: movdqa %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm15, 16(%rax) +; SSE-NEXT: movdqa %xmm15, 48(%rax) ; SSE-NEXT: movdqa %xmm9, 32(%rax) -; SSE-NEXT: movdqa %xmm1, 48(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) ; SSE-NEXT: movdqa %xmm2, (%rax) ; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq @@ -3237,51 +3228,49 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4],ymm5[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 
$1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3],xmm7[4],xmm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] @@ -3290,512 +3279,497 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: 
vpshufb %xmm4, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm3[2],xmm6[3],xmm3[4,5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm0, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4,5],xmm9[6],xmm11[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3],xmm15[4,5],xmm10[6],xmm15[7] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3],xmm13[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3,4,5],ymm8[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2],ymm10[3,4,5,6,7],ymm6[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm11 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7] +; AVX2-SLOW-NEXT: vpblendd $109, (%rsp), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm6[0],mem[1],ymm6[2,3,4,5],mem[6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm15, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3],xmm7[4,5],xmm2[6],xmm7[7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3],xmm9[4,5],xmm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm2[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1,2],xmm6[3],xmm3[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm15[4],xmm9[5,6],xmm15[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2],ymm9[3,4,5,6,7],ymm6[8,9,10],ymm9[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[1,1,1,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 
= xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), 
%ymm9 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1],xmm0[2,3],xmm6[4],xmm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendvb 
%ymm0, %ymm10, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm9[2],xmm3[3],xmm9[4,5],xmm3[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4,5],ymm13[6],ymm15[7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] 
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3],xmm11[4,5],xmm15[6],xmm11[7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm10[3,4,5,6,7],ymm8[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3],xmm7[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} 
xmm14 = xmm14[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm10[1,2],xmm14[3],xmm10[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = 
ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm8 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm13[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1,2],xmm1[3],xmm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $109, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = 
[0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) @@ -3810,274 +3784,266 @@ ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm0[2,3],ymm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm10[1],xmm0[2,3],xmm10[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1],xmm0[2,3],xmm6[4],xmm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, 
%ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm9[2],xmm3[3],xmm9[4,5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4,5],ymm13[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3],xmm9[4,5],xmm4[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm9, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm12 
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3],xmm11[4,5],xmm15[6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm10[3,4,5,6,7],ymm8[8,9,10],ymm10[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = 
[6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3],xmm7[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} 
xmm4 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm10[1,2],xmm14[3],xmm10[4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1,2],xmm1[3],xmm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm2 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) @@ -4092,540 +4058,547 @@ ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $136, %rsp -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm0[1],ymm15[2,3],ymm0[4],ymm15[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5],xmm9[6],xmm7[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm13[1],ymm9[2,3,4,5],ymm13[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm12[1],ymm0[2,3,4,5],ymm12[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1,2],xmm12[3],xmm8[4,5],xmm12[6],xmm8[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm8[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, 
%ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm14[1],ymm0[2,3,4,5],ymm14[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2],ymm1[3,4],ymm13[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm20 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[0,1,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1,2,3],xmm13[4],xmm1[5,6],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2],xmm0[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm11, 
%ymm8 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm14[4],xmm10[5],xmm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] 
+; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $136, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: subq $168, %rsp +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm1 
-; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3],xmm10[4,5],xmm4[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm10[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm10, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3,4,5],ymm0[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2],xmm3[3],xmm11[4,5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm13[3],xmm9[4,5],xmm13[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm10 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3],xmm12[4,5],xmm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm1 +; 
AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd 
{{.*#+}} xmm7 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm11[2],ymm14[3],ymm11[4],ymm14[5,6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm10, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5,6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 
$1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4],xmm8[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm14[1],ymm11[2,3,4,5],ymm14[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm13, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm12, %ymm7 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm15[4],xmm9[5],xmm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5],ymm9[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: 
vpternlogq $248, %ymm23, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] @@ -4635,558 +4608,562 @@ ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; 
AVX512F-ONLY-FAST-NEXT: addq $136, %rsp +; AVX512F-ONLY-FAST-NEXT: addq $168, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm0[1],ymm4[2,3,4,5],ymm0[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm6[0,1,2],xmm10[3],xmm6[4,5],xmm10[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm6[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm27 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm11 ; AVX512DQ-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm14[2],xmm8[3],xmm14[4,5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3],xmm7[4,5],xmm15[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufb 
%xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm17, %zmm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm1[2],ymm10[3],ymm1[4],ymm10[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} 
zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7],ymm8[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm8, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2],xmm9[3],xmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5,6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm2 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1] -; 
AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4,5],ymm10[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm13 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm14, %ymm9, %ymm6 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = 
xmm9[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4],xmm9[5],xmm14[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm9[1],ymm10[2,3,4,5],ymm9[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm14, %ymm4, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm7[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm8 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: pushq %rax +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm4[3,4,5,6,7] ; 
AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3],xmm8[4,5],xmm0[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm6[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm9[1],ymm12[2,3,4,5],ymm9[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm10 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm2 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm30 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm15[3],xmm8[4,5],xmm15[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; 
AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm30 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; 
AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3],xmm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1],ymm9[2],ymm13[3],ymm9[4],ymm13[5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm17[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3],ymm11[4],ymm0[5,6],ymm11[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0,1,2],ymm12[3,4,5,6,7],ymm8[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm8, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm17, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1,2],xmm12[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; 
AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3],xmm12[4],xmm6[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; 
AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm13[1],ymm9[2,3,4,5],ymm13[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm12 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm12, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm11, %ymm4 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5],xmm8[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5],ymm11[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4,5],ymm10[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb 
%xmm7, %xmm11, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm8 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 @@ -5194,6 +5171,7 @@ ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%r8) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -5303,30 +5281,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movdqa 496(%rdi), %xmm5 +; SSE-NEXT: subq $1144, %rsp # imm = 0x478 +; SSE-NEXT: movdqa 400(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 416(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -5343,26 +5321,25 @@ ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: 
psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 480(%rdi), %xmm0 +; SSE-NEXT: movdqa 384(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 544(%rdi), %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa 448(%rdi), %xmm3 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5377,48 +5354,45 @@ ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 528(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa 432(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5426,65 +5400,65 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm0 +; SSE-NEXT: movdqa 512(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 400(%rdi), %xmm2 +; SSE-NEXT: movdqa 496(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 480(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 448(%rdi), %xmm3 -; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa 544(%rdi), %xmm3 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 432(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 528(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5496,7 +5470,7 @@ ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), 
%xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5504,38 +5478,38 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 704(%rdi), %xmm0 +; SSE-NEXT: movdqa 608(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 688(%rdi), %xmm2 +; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 736(%rdi), %xmm3 -; SSE-NEXT: movdqa 752(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa 640(%rdi), %xmm4 +; SSE-NEXT: movdqa 656(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 720(%rdi), %xmm1 +; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5543,95 +5517,95 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: 
movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm8 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm4 -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 704(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 592(%rdi), %xmm13 -; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa 
%xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 688(%rdi), %xmm15 +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 640(%rdi), %xmm5 -; SSE-NEXT: movdqa 656(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] +; SSE-NEXT: movdqa 736(%rdi), %xmm13 +; SSE-NEXT: movdqa 752(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm4[2,3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa 624(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa 720(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = 
mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] @@ -5641,103 +5615,101 @@ ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm9, %xmm1 +; 
SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: psrld $16, %xmm8 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; 
SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] @@ -5746,8 +5718,8 @@ ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[1,0,2,3,4,5,6,7] @@ -5760,49 +5732,22 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movaps (%rsp), %xmm0 # 
16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] @@ -5815,221 +5760,227 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm15[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm15[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = 
xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: por %xmm12, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; 
SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm6, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm7 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufhw $231, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm6, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm7[0] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm7 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; 
SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: pshufd $250, (%rsp), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm3 @@ -6037,20 +5988,20 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; 
SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm3 @@ -6058,17 +6009,15 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm10 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm10[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] @@ -6079,17 +6028,20 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand 
%xmm4, %xmm2 @@ -6097,18 +6049,38 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm4, %xmm1 @@ -6117,244 +6089,245 @@ ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm15[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm15[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: andps %xmm13, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 
16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1],mem[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: 
movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm3 ; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: psrld $16, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; 
SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 @@ -6366,15 +6339,15 @@ ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 @@ -6386,96 +6359,96 @@ ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm14 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: pandn %xmm2, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm0, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6488,37 +6461,38 @@ ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movdqa %xmm8, 112(%r9) +; SSE-NEXT: movdqa %xmm9, 96(%r9) +; SSE-NEXT: movdqa %xmm10, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm7, 112(%r9) -; SSE-NEXT: movdqa %xmm8, 96(%r9) -; SSE-NEXT: movdqa %xmm15, 80(%r9) -; SSE-NEXT: movdqa %xmm12, 64(%r9) +; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm14, 112(%rax) +; SSE-NEXT: movdqa %xmm13, 112(%rax) ; SSE-NEXT: movdqa %xmm3, 96(%rax) ; SSE-NEXT: movdqa %xmm4, 80(%rax) -; SSE-NEXT: movdqa %xmm10, 64(%rax) -; SSE-NEXT: movdqa %xmm11, 48(%rax) -; SSE-NEXT: movdqa %xmm9, 32(%rax) -; SSE-NEXT: movdqa %xmm5, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 +; SSE-NEXT: movdqa %xmm15, 64(%rax) +; SSE-NEXT: movdqa %xmm12, 48(%rax) +; SSE-NEXT: movdqa %xmm5, 32(%rax) +; SSE-NEXT: movdqa %xmm6, 16(%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: addq $1144, %rsp # imm = 0x478 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] @@ -6537,13 +6511,13 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] @@ -6551,16 +6525,16 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: 
vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 @@ -6570,149 +6544,150 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpslld $16, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} 
xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 
+; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 
624(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -6734,15 +6709,15 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] @@ -6751,12 +6726,12 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] @@ -6770,9 +6745,10 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; 
AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6782,38 +6758,36 @@ ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded 
Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] @@ -6825,56 +6799,58 @@ ; AVX1-ONLY-NEXT: # xmm9 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm9 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6882,156 +6858,186 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $85, (%rsp), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm10[4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm13[0],xmm5[0] +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm14[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; 
AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm1[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7039,112 +7045,79 @@ ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm0, 
%ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = 
xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7153,12 +7126,53 @@ ; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm13[1] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -7168,20 +7182,22 @@ ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded 
Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm2[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vpunpckhqdq (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -7193,232 +7209,192 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm2[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} 
xmm7 = xmm6[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm9 ; 
AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm12[1],xmm10[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm5[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7436,10 +7412,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) @@ -7456,311 +7432,311 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqu 
%ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[0,1],ymm6[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3,4,5],ymm15[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm9[1],xmm3[2,3],xmm9[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm11 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm14[1],xmm11[2,3],xmm14[4],xmm11[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3,4,5],ymm9[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 
= xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte 
Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: 
vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1],xmm9[2],xmm5[3],xmm9[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, 
%xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2],xmm10[3],xmm0[4,5],xmm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3],xmm10[4,5],xmm7[6],xmm10[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7782,593 +7758,585 @@ ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded 
Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm14[4],xmm4[5,6],xmm14[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 
= xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] +; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm0[4],xmm14[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw $103, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1,2],xmm1[3],xmm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2],xmm1[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-SLOW-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-SLOW-NEXT: vzeroupper ; 
AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = 
xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm7 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0],xmm14[1],xmm12[2,3],xmm14[4],xmm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm13 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0],xmm14[1],xmm1[2,3],xmm14[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-NEXT: 
vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3],xmm3[4],xmm9[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm4 -; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm4[0],mem[1],ymm4[2,3,4,5],mem[6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm12, 
%xmm7, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm6 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7] +; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4,5],ymm15[6],mem[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm15[2],xmm6[3],xmm15[4,5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3],xmm14[4,5],xmm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb 
%xmm11, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8378,36 +8346,35 @@ ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,6,5,6,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; 
AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8417,548 +8384,543 @@ ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded 
Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = 
ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm15[0],xmm0[1,2],xmm15[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: 
# ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5],xmm1[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 
32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5],xmm10[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5],xmm8[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: 
vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, (%rax) -; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-PERLANE-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[0,1],ymm6[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, 
%xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0],xmm14[1],xmm12[2,3],xmm14[4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw 
{{.*#+}} xmm14 = xmm1[0],xmm14[1],xmm1[2,3],xmm14[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3],xmm3[4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = 
mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm4[0],mem[1],ymm4[2,3,4,5],mem[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 320(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3],xmm4[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4,5],ymm15[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2],xmm0[3],xmm6[4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm2, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm11[2],xmm6[3],xmm11[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm15[2],xmm6[3],xmm15[4,5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = 
[2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3],xmm14[4,5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7],ymm0[8,9,10],ymm15[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8968,36 +8930,35 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -9007,463 +8968,451 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm7[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm0[1,2],xmm15[3],xmm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm13[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 -; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5],xmm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm15, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} 
xmm12 = xmm12[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm0[2],xmm11[3],xmm0[4,5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[3],xmm0[4,5],xmm12[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, 
%xmm9, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3],xmm1[4,5],xmm8[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] @@ -9471,14 +9420,14 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 ; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -9489,365 +9438,357 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,3,4,5,6,7] 
+; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm13[1,2],xmm0[3],xmm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3],xmm13[4],xmm15[5,6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm13, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3],xmm11[4],xmm14[5,6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2],xmm0[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb 
%ymm13, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm22, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm4, %ymm3 ; 
AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm29 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4,5],mem[6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufb 
%xmm8, %xmm15, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: 
vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm26[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0,1,2,3],xmm3[4],xmm13[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm25[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm12, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm1, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload @@ -9855,12 +9796,12 @@ ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -9872,452 +9813,433 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512F-ONLY-SLOW-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: 
load_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm13[2],xmm12[3],xmm13[4,5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3],xmm0[4,5],xmm14[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2],xmm0[3],xmm10[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 ; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu 
(%rsp), %ymm5 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = 
xmm7[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm11, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb 
%xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0,1,2],ymm14[3,4,5,6,7],ymm12[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: 
vpshufb %xmm1, %xmm0, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2],xmm14[3],xmm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm10 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7],ymm13[8,9,10],ymm10[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1,2],xmm9[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm22, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 
%xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 @@ -10339,15 +10261,15 @@ ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, 
%xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 @@ -10359,7 +10281,7 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -10386,32 +10308,32 @@ ; AVX512F-ONLY-FAST-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: 
vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 @@ -10420,13 +10342,13 @@ ; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -10447,54 +10369,54 @@ ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm15, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 @@ -10503,91 +10425,88 @@ ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%r8) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512F-ONLY-FAST-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 
704(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 @@ -10595,1122 +10514,1091 @@ ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: 
vpshufb %xmm9, %xmm13, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3],xmm1[4,5],xmm12[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 
704(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm0[2],xmm12[3],xmm0[4,5],xmm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3],xmm0[4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm1[2],xmm11[3],xmm1[4,5],xmm11[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = 
xmm13[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm3[2],ymm0[3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 ; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,2,3,4,5,6,7] 
+; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-SLOW-NEXT: 
vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm29 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,3,4,5,6,7] +; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1,2],xmm0[3],xmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5,6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2],ymm9[3,4,5,6,7],ymm8[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm28 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2],xmm0[3],xmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm22 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm30, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm27 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm25, %ymm4, %ymm2 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: 
vpshufb %xmm12, %xmm6, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5],mem[6],ymm4[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd $36, (%rsp), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm25, %ymm1, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; 
AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm24 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb 
%ymm12, %ymm6, %ymm15 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm30[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm26[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] 
-; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm25, %ymm13, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm8, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm20, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm31[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3],xmm14[4],xmm12[5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm25, %ymm8, %ymm12 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm12, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm4 = xmm4[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm30, %zmm15 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm30, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: addq $840, %rsp # imm = 0x348 +; AVX512DQ-SLOW-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $904, %rsp # imm = 0x388 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; 
AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm7 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm3, 
%ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5],ymm5[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3],xmm2[4,5],xmm10[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-FAST-NEXT: 
vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3],xmm0[4,5],xmm13[6],xmm0[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm13[2],xmm10[3],xmm13[4,5],xmm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2],ymm14[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3],xmm13[4,5],xmm9[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7],ymm13[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 
32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm23 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm22 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512DQ-FAST-NEXT: 
vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm21, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0,1,2],ymm14[3,4,5,6,7],ymm12[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm12 +; AVX512DQ-FAST-NEXT: 
vpshuflw {{.*#+}} xmm14 = xmm10[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2],xmm14[3],xmm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7],ymm13[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1,2],xmm9[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm21 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm30 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] ; AVX512DQ-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm12 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm3, %ymm2 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm27 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = 
[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm24 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm14 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm14 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm12, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2,3],xmm4[4],xmm14[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1,2,3],xmm4[4],xmm11[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm10[1],xmm12[2,3],xmm10[4],xmm12[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm14, %zmm26 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; 
AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3],xmm9[4],xmm14[5],xmm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm9[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm20, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3],xmm0[4],xmm13[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rdx) ; 
AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm21, %zmm2, %zmm23 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm9 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%r9) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FAST-NEXT: addq $904, %rsp # imm = 0x388 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -119,20 +119,20 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FAST-NEXT: vmovd %xmm4, (%rdx) ; AVX2-FAST-NEXT: vmovd %xmm7, (%rcx) -; AVX2-FAST-NEXT: vmovd %xmm5, (%r8) +; AVX2-FAST-NEXT: vmovd %xmm6, (%r8) ; AVX2-FAST-NEXT: vmovd %xmm8, (%r9) ; AVX2-FAST-NEXT: vmovd %xmm3, (%r10) ; AVX2-FAST-NEXT: vmovd %xmm0, (%rax) @@ 
-147,20 +147,20 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 8(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm1, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm4, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovd %xmm5, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovd %xmm6, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm8, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm3, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%rax) @@ -205,20 +205,20 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX512F-FAST-NEXT: vmovd %xmm4, (%rdx) ; AVX512F-FAST-NEXT: vmovd %xmm7, (%rcx) -; AVX512F-FAST-NEXT: vmovd %xmm5, 
(%r8) +; AVX512F-FAST-NEXT: vmovd %xmm6, (%r8) ; AVX512F-FAST-NEXT: vmovd %xmm8, (%r9) ; AVX512F-FAST-NEXT: vmovd %xmm3, (%r10) ; AVX512F-FAST-NEXT: vmovd %xmm0, (%rax) @@ -406,49 +406,49 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovq %xmm1, (%rsi) -; AVX2-SLOW-NEXT: vmovq %xmm6, (%rdx) -; AVX2-SLOW-NEXT: vmovq %xmm3, (%rcx) -; AVX2-SLOW-NEXT: vmovq %xmm4, (%r8) -; AVX2-SLOW-NEXT: vmovq %xmm5, (%r9) +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovq %xmm2, (%rsi) +; AVX2-SLOW-NEXT: vmovq %xmm3, (%rdx) +; AVX2-SLOW-NEXT: vmovq %xmm4, (%rcx) +; AVX2-SLOW-NEXT: vmovq %xmm5, (%r8) +; AVX2-SLOW-NEXT: vmovq %xmm6, (%r9) ; AVX2-SLOW-NEXT: vmovq %xmm7, (%r10) ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -463,26 +463,26 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 @@ -493,11 +493,11 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FAST-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FAST-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FAST-NEXT: vmovq %xmm3, (%r8) -; AVX2-FAST-NEXT: vmovq %xmm4, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FAST-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FAST-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FAST-NEXT: vmovq %xmm4, (%r8) +; AVX2-FAST-NEXT: vmovq %xmm5, (%r9) ; AVX2-FAST-NEXT: vmovq %xmm7, (%r10) ; AVX2-FAST-NEXT: vmovq %xmm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -512,26 +512,26 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 @@ -542,11 +542,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm7, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -556,20 +556,20 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm2[0,1],xmm4[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] @@ -612,45 +612,45 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, 
%xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%rdx) +; AVX512F-FAST-NEXT: vmovq %xmm2, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8) ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r9) ; AVX512F-FAST-NEXT: vmovq %xmm7, (%r10) -; AVX512F-FAST-NEXT: vmovq %xmm2, (%rax) +; AVX512F-FAST-NEXT: vmovq %xmm3, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -831,7 +831,7 @@ ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] @@ -844,7 +844,7 @@ ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2] ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -1017,7 
+1017,7 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 @@ -1100,7 +1100,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] @@ -1179,7 +1179,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] @@ -1233,53 +1233,55 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,2,1,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; 
AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -1292,10 +1294,10 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%r10) +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%r8) +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%r10) ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1317,47 +1319,49 @@ ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] @@ -1369,10 +1373,10 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%r8) -; 
AVX512F-FAST-NEXT: vmovdqa %xmm9, (%r9) -; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r10) +; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %xmm11, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%r10) ; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1427,121 +1431,117 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 176(%rdi), %xmm15 -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: movdqa 192(%rdi), %xmm15 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm13 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm13[2,2] +; SSE-NEXT: movaps %xmm13, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm10, %xmm4 ; SSE-NEXT: andnps %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,0,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movaps 160(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm14[2,2] +; SSE-NEXT: movaps %xmm10, %xmm4 ; SSE-NEXT: andnps %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] +; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm12, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = 
xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movdqa %xmm15, %xmm5 @@ -1549,273 +1549,269 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pandn %xmm10, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm5 = xmm5[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1] +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm11 -; SSE-NEXT: orps %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: orps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: 
movaps %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: orps %xmm0, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: andnps %xmm5, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: andnps %xmm1, %xmm15 -; SSE-NEXT: orps %xmm0, %xmm15 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm14 ; SSE-NEXT: psrld $16, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: psrlq $16, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: psrld $16, %xmm15 +; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movdqa %xmm12, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movapd %xmm1, (%r9) +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movapd %xmm5, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm15, (%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm11, (%rax) -; SSE-NEXT: movapd %xmm0, 16(%rax) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm0, (%rax) +; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf16: @@ -2120,10 +2116,10 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 @@ -2138,22 +2134,22 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6],xmm8[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-SLOW-NEXT: 
vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7,8,9,10],ymm11[11],ymm10[12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] @@ -2169,12 +2165,12 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4],ymm12[5,6,7,8,9,10,11],ymm11[12],ymm12[13,14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8 @@ -2199,12 +2195,12 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = 
ymm13[1,3,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10 @@ -2218,18 +2214,18 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7,8,9,10,11,12,13],ymm12[14],ymm11[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 @@ -2243,19 +2239,19 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm15[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1,2,3,4,5,6],ymm4[7,8],ymm15[9,10,11,12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -2267,19 +2263,19 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rdx) @@ -2300,57 +2296,57 @@ ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, 
%xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3,4,5],xmm9[6],xmm11[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,6,1,u,5,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm12, %ymm9 
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] @@ -2359,9 +2355,9 @@ ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm12[1,2,3,4,5,6,7],ymm9[8],ymm12[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] @@ -2379,80 +2375,79 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2],xmm15[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3,4,5,6,7],ymm5[8],ymm9[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,7,0,0,3,7,0] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd 
%ymm7, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6,7],ymm13[8],ymm10[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,3,7,0,0,3,7,0] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5,6,7],ymm13[8,9,10,11,12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,4,7,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1,2,3,4,5,6,7],ymm12[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm10, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -2460,10 +2455,10 @@ ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6 @@ -2474,25 +2469,25 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7,8,9,10],ymm12[11],ymm11[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] @@ -2506,14 +2501,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8 @@ -2531,88 +2525,87 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = 
ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3],xmm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm14, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7,8,9,10,11,12,13],ymm12[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = 
xmm7[0,1],xmm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5],xmm12[6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,u,u,u,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7,8],ymm13[9,10,11,12,13,14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm14[1],xmm12[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,2,3,0,1,14,15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9) @@ -2665,12 +2658,12 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6,7,8,9,10],ymm8[11],ymm10[12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 @@ -2681,43 +2674,43 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4],ymm11[5,6,7,8,9,10,11],ymm10[12],ymm11[13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm11, %ymm10, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,1,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,1,4,5,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] @@ -2794,307 +2787,157 @@ ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf16: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = 
xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: 
vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%r9) -; 
AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, (%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-FAST-LABEL: load_i16_stride7_vf16: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} 
xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm14, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm13[1,2,3,4,5,6,7],ymm8[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm14 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; 
AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%r9) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, (%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-FAST-LABEL: load_i16_stride7_vf16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm16 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,9,u,13,4,u,u,7> +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vpermi2d %ymm5, %ymm7, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, 
%xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,5,9,u,12,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm11, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,9,u,13,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[2,3,4,5,10,11] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,11,14,u,u,5,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm5, %ymm7, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
AVX512F-FAST-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm16[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3,4,5,6,7],ymm12[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,7,10,14,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm9[1,2,3,4,5,6,7],ymm3[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,9,12,2,5,9,12] +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,3,7,0,0,3,7,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm13[5,6,7],ymm3[8,9,10,11,12],ymm13[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = 
xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,4,7,11,14,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm3[1,2,3,4,5,6,7],ymm13[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,6,9,13,2,6,9,13] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,12,0,3,7,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm5, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm11, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %ymm9, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%r10) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride7_vf16: ; AVX512BW: # %bb.0: @@ -3183,81 +3026,81 @@ ; SSE-LABEL: load_i16_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $600, %rsp # imm = 0x258 -; SSE-NEXT: movdqa 304(%rdi), %xmm5 -; SSE-NEXT: movdqa 288(%rdi), %xmm6 -; SSE-NEXT: movdqa 112(%rdi), %xmm13 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; 
SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] -; SSE-NEXT: movaps %xmm7, %xmm10 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm10[2,2] +; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps 160(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
144(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 @@ -3268,366 +3111,371 @@ ; SSE-NEXT: movdqa 416(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps 384(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] -; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm4 
+; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3] +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm8 -; SSE-NEXT: orps %xmm1, %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: orps %xmm10, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm14 -; SSE-NEXT: orps %xmm4, %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm13, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: orps %xmm7, 
%xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,1,0,1] +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd $196, (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm13, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] ; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3] +; SSE-NEXT: andps %xmm13, %xmm3 +; SSE-NEXT: orps %xmm6, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm14, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa 
%xmm13, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3635,35 +3483,36 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn 
%xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: andps %xmm9, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 @@ -3675,18 +3524,18 @@ ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -3694,30 +3543,29 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -3727,28 +3575,28 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: psrld $16, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; 
SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -3758,102 +3606,100 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: psrld $16, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte 
Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: pshufhw {{.*#+}} 
xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -3861,119 +3707,117 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm15, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm7, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm0, (%rax) +; SSE-NEXT: movapd %xmm0, 32(%rax) ; SSE-NEXT: movapd %xmm3, 48(%rax) -; SSE-NEXT: movapd %xmm4, 32(%rax) -; SSE-NEXT: movapd %xmm5, 16(%rax) +; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm5, (%rax) ; SSE-NEXT: addq $600, %rsp # imm = 
0x258 ; SSE-NEXT: retq ; @@ -4612,136 +4456,131 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-SLOW-NEXT: subq $552, %rsp # imm = 0x228 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm6[1],ymm12[2,3,4],ymm6[5],ymm12[6,7] +; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm6[2],ymm12[3,4],ymm6[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7,8,9,10],ymm3[11],ymm0[12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7,8,9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -4756,7 +4595,7 @@ ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4,5],ymm15[6],ymm10[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] @@ -4769,7 +4608,7 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 @@ -4781,20 +4620,21 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] ; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -4803,205 +4643,212 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6,7],ymm11[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 
= xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7,8,9,10,11,12,13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, 
%xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm12[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15] -; 
AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,5],xmm10[6],xmm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7,8],ymm11[9,10,11,12,13,14],ymm14[15] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4,5,6,7],ymm3[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd 
$18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7,8],ymm8[9],ymm7[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3],xmm11[4],xmm6[5],xmm11[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $51, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm13[0,1,2],mem[3],ymm13[4,5],mem[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) @@ -5022,408 +4869,409 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-SLOW-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm10 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2,3,4,5],xmm4[6],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,1,u,5,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, 
%ymm8 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm6[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,3,2,3,2,5] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5],xmm0[6],xmm8[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 
$1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6,7],ymm11[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 
$1, %ymm2, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm14, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: 
vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 
32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-NEXT: vzeroupper @@ -5432,183 +5280,179 @@ ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $552, %rsp # imm = 0x228 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 
32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7,8,9,10],ymm3[11],ymm0[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} 
xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; 
AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7,8,9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4],xmm2[5],xmm14[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm10[2],ymm8[3,4,5],ymm10[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm13[4],xmm2[5],xmm13[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -5616,197 +5460,198 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6,7],ymm11[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: 
vpermq {{.*#+}} ymm3 = ymm5[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6,7],ymm12[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7,8,9,10,11,12,13],ymm2[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, 
%xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7,8,9,10,11,12,13],ymm3[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm0[0,1,2,3,4,5],xmm13[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1,2,3,4,5,6,7],ymm1[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4,5,6],ymm11[7,8],ymm3[9,10,11,12,13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm7[1,2,3,4,5,6,7],ymm2[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6,7,8],ymm6[9],ymm2[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2],ymm8[3],mem[4,5],ymm8[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm2, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 
= ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) @@ -5827,10 +5672,10 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $552, %rsp # imm = 0x228 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -5903,9 +5748,9 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 @@ -5972,8 +5817,7 @@ ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7,8,9,10,11,12,13],ymm8[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2],xmm7[3,4,5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] @@ -6090,7 +5934,7 @@ ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3],ymm10[4,5,6,7,8,9,10],ymm13[11],ymm10[12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 @@ -6198,304 +6042,311 @@ ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm11, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: subq $72, %rsp +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm9, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm24[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,9,u,13,4,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 96(%rdi), %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm26, %ymm28, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm10[2],ymm15[3,4,5],ymm10[6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm3, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4],xmm11[5],xmm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm9[2],ymm5[3,4,5],ymm9[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5],xmm0[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm4[1],xmm6[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm9[3],ymm5[4,5],ymm9[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm27, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm2[2],ymm8[3,4,5],ymm2[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3,4,5],xmm9[6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, 
%ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7],ymm13[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm16, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7],ymm10[8,9,10,11,12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm12, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm10, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm9 -; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,11,14,u,u,5,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm26, %ymm28, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm24[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3,4],ymm5[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm11, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm6[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3,4,5],xmm10[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2],xmm0[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm6[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7],ymm10[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm9, %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,u,u,u,12,0,3,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm30, %ymm29, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm13, %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm27, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <9,u,u,u,13,0,4,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm30, %ymm29, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <10,u,u,u,4,13,u,1> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm30, %ymm29, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm30, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7],ymm8[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm26, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm14, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm8[1],xmm2[2,3,4,5],xmm8[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm9, %ymm27, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm30, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; 
AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm26, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw 
{{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm30, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <9,12,0,3,7,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm26, %ymm28, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm6, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm18 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm27, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm21, %zmm30, 
%zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm30, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm12, %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $72, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -6571,9 +6422,9 @@ ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -6640,8 +6491,7 @@ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -6749,7 +6599,7 @@ ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7,8,9,10],ymm12[11],ymm3[12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm7 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] @@ -6869,305 +6719,298 @@ ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm30, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm26[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm7, %ymm10, %ymm22 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm23[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,9,u,13,4,u,u,7> +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm10[2],ymm15[3,4,5],ymm10[6],ymm15[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm30 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4],xmm10[5],xmm11[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4,5,6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: 
vpshufb %ymm6, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm9, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0],xmm0[1],xmm10[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5],xmm11[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5],xmm8[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm12, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm21, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm11, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} 
ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm17, %zmm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm28 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm29 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw 
{{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4,5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm10, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm7[1],xmm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm20 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm31 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,11,14,u,u,5,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; 
AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,5,9,12,2,5,9,12] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7],ymm11[8,9,10,11,12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm13, %zmm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm9 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermd %zmm29, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm21, %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <9,u,u,u,12,0,3,7> +; 
AVX512DQ-FAST-NEXT: vpermi2d %ymm29, %ymm28, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm4, %ymm7, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4,5],ymm4[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm27, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5],xmm9[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <9,u,u,u,13,0,4,7> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm29, %ymm28, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm20, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <10,u,u,u,4,13,u,1> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm29, %ymm28, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm13, %zmm18 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3,4,5],xmm8[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm13, %zmm19 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3,4,5],xmm4[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm5, %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> +; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm31, %xmm9 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2],xmm0[3],xmm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3,4,5,6,7],ymm14[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm27, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm10, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2],xmm7[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2],ymm6[3,4,5,6,7],ymm4[8,9,10],ymm6[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <9,12,0,3,7,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm25, %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm20, %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm16, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7313,331 +7156,331 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i16_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1352, %rsp # imm = 0x548 -; SSE-NEXT: movdqa 640(%rdi), %xmm9 -; SSE-NEXT: movdqa 624(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 128(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $1368, %rsp # imm = 0x558 +; SSE-NEXT: movdqa 528(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,2] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd 
{{.*#+}} xmm4 = xmm8[0,1,0,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 656(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: movdqa 544(%rdi), %xmm7 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 608(%rdi), %xmm2 +; SSE-NEXT: movaps 496(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 560(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 576(%rdi), %xmm5 +; SSE-NEXT: movdqa 464(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 
%xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa 640(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps 608(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 448(%rdi), %xmm0 +; SSE-NEXT: movdqa 560(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 464(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; 
SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 384(%rdi), %xmm2 +; SSE-NEXT: movaps 272(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 864(%rdi), %xmm4 +; SSE-NEXT: movdqa 752(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps 720(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm0 +; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 784(%rdi), %xmm0 +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 416(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm2 +; SSE-NEXT: movaps 384(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm0 +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm0 +; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 752(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa 864(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps 832(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm0 +; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm5 ; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -7645,13 +7488,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7665,12 +7508,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -7678,13 +7521,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7698,25 +7541,26 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por 
%xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7730,26 +7574,27 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7763,12 +7608,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -7776,56 +7621,55 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm8, %xmm15 ; SSE-NEXT: pandn %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: pand %xmm12, %xmm15 
; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 @@ -7836,27 +7680,27 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pand %xmm12, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,1] ; SSE-NEXT: movdqa %xmm3, %xmm2 @@ -7867,289 +7711,233 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm4 -; SSE-NEXT: orps %xmm2, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm6 -; SSE-NEXT: orps %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm8 +; SSE-NEXT: orps %xmm2, %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,1] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: orps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,1] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm5, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = 
mem[0,1,0,1] +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,0,1] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhdq (%rsp), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: orps %xmm6, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] ; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm6[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm9[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm8, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: por %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1] -; SSE-NEXT: pandn %xmm11, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7] 
+; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: orps %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm11 -; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: orps %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm14, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} 
xmm6 = xmm3[0],xmm6[1,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm3[0],xmm10[1,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] @@ -8161,74 +7949,126 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: 
movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -8236,23 +8076,24 @@ ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm1 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -8262,23 +8103,21 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: andps %xmm14, %xmm1 +; SSE-NEXT: andps %xmm12, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -8297,30 +8136,30 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -8347,8 +8186,7 @@ ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] @@ -8374,39 +8212,40 @@ ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] 
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: punpckhdq (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] @@ -8415,29 +8254,28 @@ ; SSE-NEXT: 
pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -8445,10 +8283,10 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8461,44 +8299,42 @@ ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: psrld $16, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: psrld $16, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -8509,13 +8345,13 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] @@ -8528,13 +8364,13 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] @@ -8547,13 +8383,13 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] @@ -8563,16 +8399,15 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] @@ -8582,72 +8417,76 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; 
SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} 
xmm0 = xmm0[0,1,2,3,5,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8655,13 +8494,13 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8672,16 +8511,16 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8692,16 +8531,16 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8712,16 +8551,16 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8732,16 +8571,16 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8752,16 +8591,16 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8772,18 +8611,19 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8791,65 +8631,65 @@ ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps 
%xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movdqa %xmm14, 112(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm12, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8881,9 +8721,10 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm11, 112(%rax) -; SSE-NEXT: movaps %xmm12, 96(%rax) -; SSE-NEXT: movaps %xmm13, 80(%rax) +; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movaps %xmm11, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8892,23 +8733,22 @@ ; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm15, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 112(%rax) -; SSE-NEXT: movapd %xmm4, 96(%rax) -; SSE-NEXT: movapd %xmm5, 80(%rax) -; SSE-NEXT: movapd %xmm6, 64(%rax) -; SSE-NEXT: movapd %xmm7, 48(%rax) -; SSE-NEXT: movapd %xmm8, 32(%rax) -; SSE-NEXT: movapd %xmm9, 16(%rax) -; SSE-NEXT: movapd %xmm10, (%rax) -; SSE-NEXT: addq $1352, %rsp # imm = 0x548 +; SSE-NEXT: movapd %xmm3, 96(%rax) +; SSE-NEXT: movapd %xmm4, 80(%rax) +; SSE-NEXT: movapd %xmm5, 64(%rax) +; SSE-NEXT: movapd %xmm6, 48(%rax) +; SSE-NEXT: movapd %xmm7, 32(%rax) +; SSE-NEXT: movapd %xmm8, 16(%rax) +; SSE-NEXT: movapd %xmm9, (%rax) +; SSE-NEXT: addq $1368, %rsp # imm = 0x558 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX1-ONLY-NEXT: subq $1512, %rsp # imm = 0x5E8 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -8923,19 +8763,20 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] @@ -8953,11 +8794,11 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm8[2],xmm10[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 @@ -8970,21 +8811,21 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 
640(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] @@ -8997,9 +8838,9 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm7[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 @@ -9044,19 +8885,19 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm4[2],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9067,9 +8908,9 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 
336(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 @@ -9094,9 +8935,9 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] @@ -9108,17 +8949,17 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm4[2],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] @@ -9145,157 +8986,153 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpslld $16, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm11[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vpslld $16, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 
+; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm9[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] @@ -9303,13 +9140,12 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; 
AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -9331,11 +9167,9 @@ ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] @@ -9343,23 +9177,23 @@ ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] @@ -9377,187 +9211,185 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 
16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7] ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps 
%ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5],xmm7[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2],xmm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm13[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm12, 
%ymm11, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm9, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm10[6],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm10[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] @@ -9565,205 +9397,202 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 
= xmm5[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2,3,4,5],mem[6],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm5[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, 
%ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2,3,4,5],mem[6],xmm12[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm11[0],mem[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: 
# xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1,2],xmm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: 
vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] @@ -9780,16 +9609,16 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps 
%ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] @@ -9804,14 +9633,13 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 @@ -9820,56 +9648,58 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm8[0],mem[1],xmm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -9882,8 +9712,7 @@ ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] @@ -9901,23 +9730,66 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5],xmm8[6],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = 
xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm1[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] @@ -9926,102 +9798,87 @@ ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] 
-; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm9[1],xmm10[1],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm12[1],xmm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -10029,67 +9886,76 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = zero,xmm5[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = 
xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = zero,xmm0[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = zero,xmm6[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 
= xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] @@ -10099,57 +9965,17 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10159,10 +9985,10 @@ ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) @@ -10179,12 +10005,11 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10192,131 +10017,126 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX2-SLOW-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 -; 
AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm7[2],ymm11[3,4,5],ymm7[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7,8,9,10],ymm3[11],ymm1[12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 
= ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10325,114 +10145,116 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7,8,9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3,4,5],xmm4[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm14 +; 
AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = 
ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm9[1],ymm13[2,3,4],ymm9[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -10441,16 +10263,16 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = 
ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -10459,78 +10281,77 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = 
ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2] +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm8[2],ymm12[3,4,5],ymm8[6],ymm12[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm10[2],ymm3[3,4,5],ymm10[6],ymm3[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] @@ -10540,73 +10361,70 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7] +; AVX2-SLOW-NEXT: 
vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] @@ -10614,15 +10432,13 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] @@ -10631,30 +10447,49 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,0,4,5,6,4] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] @@ -10662,57 +10497,40 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7,8,9,10,11,12,13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1,2,3,4,5,6,7],ymm3[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm4[2],mem[3,4,5],ymm4[6],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] @@ -10721,275 +10539,234 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm12[1],ymm8[2,3,4],ymm12[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = 
ymm6[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2],ymm9[3,4,5],ymm6[6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7,8,9,10,11,12,13],ymm2[14],ymm3[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm10[0,1],mem[2],ymm10[3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] 
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 
-; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = 
ymm11[0,1],mem[2],ymm11[3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2,3,4,5,6,7],ymm3[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7,8],ymm3[9,10,11,12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm10[0,1],mem[2],ymm10[3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5],xmm6[6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm3[0],ymm5[1,2,3,4,5,6,7],ymm3[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3],xmm3[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm5[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2],ymm8[3],mem[4,5],ymm8[6],mem[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6,7,8],ymm6[9],ymm2[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2],ymm14[3],mem[4,5],ymm14[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd $18, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] @@ -10999,35 +10776,78 @@ ; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1,2,3,4,5,6,7],ymm1[8],ymm7[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7,8],ymm8[9],ymm7[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7],ymm8[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7],ymm5[8,9,10,11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) @@ -11046,10 +10866,10 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) @@ -11066,9 +10886,9 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax) @@ -11076,370 +10896,363 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-SLOW-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX2-SLOW-NEXT: addq $1432, %rsp # imm = 0x598 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm11 +; AVX2-FAST-NEXT: 
vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4],ymm0[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,5,1,u,4,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,1,u,5,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, 
%ymm7, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm3 ; AVX2-FAST-NEXT: 
vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm1 +; 
AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,3,2,3,2,5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -11452,859 +11265,866 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm15[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm11[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; 
AVX2-FAST-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm15[1],xmm5[2,3,4,5],xmm15[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,1,1,3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd 
%ymm12, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm7[2],mem[3,4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, 
%ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm6[1,2,3,4,5,6,7],ymm0[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX2-FAST-NEXT: 
vpermd %ymm6, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-FAST-NEXT: 
vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps 
%ymm4, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 256(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: 
vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7,8,9,10],ymm6[11],ymm4[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7,8,9,10],ymm6[11],ymm4[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm12, %ymm6, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7,8,9,10],ymm6[11],ymm4[12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4],ymm4[5,6,7,8,9,10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm5[1],xmm3[2,3,4,5],xmm5[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7,8,9,10,11],ymm7[12],ymm6[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte 
Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 
= ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7,8,9,10,11],ymm7[12],ymm6[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = 
ymm3[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm10[0,1,2],mem[3],ymm10[4,5],mem[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7 -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm7[2],ymm5[3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm11[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm15[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] @@ -12312,769 +12132,778 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm14[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm15[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, 
%xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7,8,9,10,11,12,13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1,2,3,4,5,6,7],ymm3[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7,8,9,10,11,12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7],ymm7[8,9,10,11,12],ymm10[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1,2,3,4,5,6,7],ymm5[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7,8],ymm5[9,10,11,12,13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2],xmm10[3],xmm5[4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7],ymm15[8,9,10,11,12],ymm14[13,14,15] +; AVX2-FAST-PERLANE-NEXT: 
vpblendw {{.*#+}} ymm14 = ymm7[0],ymm14[1,2,3,4,5,6,7],ymm7[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1],xmm7[2],xmm14[3],xmm7[4],xmm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,5],xmm15[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1,2,3,4,5,6],ymm11[7,8],ymm7[9,10,11,12,13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[3],xmm7[4],xmm6[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5,6,7],ymm9[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm10[1],xmm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $237, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7,8],ymm7[9],ymm6[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9) +; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1432, %rsp # imm = 0x598 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1864, %rsp # imm = 0x748 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2],ymm9[3,4,5],ymm6[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm19[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} 
xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7,8,9,10],ymm0[11],ymm2[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3,4,5],xmm4[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm7[2],ymm13[3,4,5],ymm7[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
{{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[0,1,1,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm15[1],xmm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6,7,8,9,10],ymm7[11],ymm9[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm16[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm5[1],xmm2[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4],ymm6[5,6,7,8,9,10,11],ymm4[12],ymm6[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm9, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3,4,5],xmm6[6],xmm9[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm20[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 
$2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4],ymm10[5,6,7,8,9,10,11],ymm9[12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2],xmm7[3],xmm9[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm20[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm28, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm9[5,6,7],ymm4[8,9,10,11,12],ymm9[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm21, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2],xmm9[3],xmm4[4],xmm9[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5],xmm1[6],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm24, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] 
-; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3],xmm5[4],xmm0[5],xmm5[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] @@ -13085,259 +12914,254 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 
= xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7,8,9,10,11,12,13],ymm3[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm26[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3],xmm11[4],xmm3[5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6],ymm0[7,8,9,10,11,12,13],ymm14[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm5[3,4,5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; 
AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7,8],ymm13[9,10,11,12,13,14],ymm14[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm13, %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm20[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %ymm23, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3,4,5,6],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3,4,5,6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,0,1,14,15,8,9,10,11,4,5,6,7,20,21,20,21,16,17,30,31,24,25,26,27,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4,5,6,7,8],ymm0[9],ymm12[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm2, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm20 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1] 
-; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm20[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6,7,8],ymm0[9],ymm6[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm24, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: 
vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4,5],ymm4[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm11[4],xmm6[5],xmm11[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0,1,2],ymm6[3,4,5,6,7],ymm11[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7],ymm11[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = 
ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm10[2],ymm2[3,4,5],ymm10[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7],ymm6[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] @@ -13345,1799 +13169,1852 @@ ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm28 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm22[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7,8,9,10],ymm12[11],ymm11[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = 
xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5],ymm0[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm11[2],ymm5[3,4,5],ymm11[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7,8,9,10,11,12,13],ymm0[14],ymm6[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm22[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5,6,7,8,9,10],ymm11[11],ymm6[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5],xmm11[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5,6,7,8,9,10],ymm11[11],ymm2[12,13,14,15] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3,4,5],xmm11[6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm30 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2],xmm6[3],xmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3],xmm11[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, 
%xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm16 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm28 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX512F-ONLY-SLOW-NEXT: 
vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm1[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6,7,8],ymm15[9],ymm8[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm20, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd 
{{.*#+}} ymm6 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm4[0,1],mem[2],ymm4[3,4,5],mem[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7,8],ymm9[9],ymm1[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = 
xmm9[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm7[0,1],mem[2],ymm7[3,4,5],mem[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 
$1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm13 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm30 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm25 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm26 # 64-byte Folded Reload +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm23, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm21, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1864, %rsp # imm = 0x748 +; AVX512F-ONLY-SLOW-NEXT: addq $1800, %rsp # imm = 0x708 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1768, %rsp # imm = 0x6E8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm2, %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 160(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm21[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,4,13,4,13,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm4, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm20, %ymm19, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, 
%xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm30[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm2[1],xmm4[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm20[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb 
%ymm2, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm10[1],xmm11[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm11, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,0,3,2,3,2,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm21, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm8[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm14, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm27, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm7, %zmm7 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm11, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2],xmm8[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm30[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm20, %ymm16, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,11,14,7,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm26, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm21[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm27, %ymm28, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm20[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 
-; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm20, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm30, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm14[1],ymm9[2,3,4],ymm14[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm18, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,3,3,3,0,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm31, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512F-ONLY-FAST-NEXT: 
vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm11, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm24, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,3,3,0,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm21, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm23, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 864(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm23, %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm25, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm20, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm24, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [2,6,9,13,2,6,9,13] +; 
AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm24, %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,4,7,0,0,4,7,0] ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm21, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3],xmm12[4],xmm3[5],xmm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4,5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm25, %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm20, %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm24, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3],xmm13[4],xmm2[5],xmm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm16, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3,4,5,6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d 
%ymm25, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4],xmm2[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 800(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm17, %ymm20, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5],xmm3[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <9,u,u,u,12,0,3,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm15, %ymm4, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3,4,5,6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm15, %zmm12 -; AVX512F-ONLY-FAST-NEXT: 
vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4,5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm22, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,3,4,5,6,7,2,3,4,5,10,11,12,13,20,21,18,19,20,21,22,23,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm3[4],xmm12[5],xmm3[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4],xmm5[5],xmm0[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm20, %ymm17, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm1, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm23, %ymm19, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm22, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm12[4],xmm8[5],xmm12[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <9,u,u,u,13,0,4,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm15, %ymm12, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm14, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm20, %ymm17, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4,5],ymm10[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm23, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,10,3,4,13,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm15, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm5, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5],xmm11[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm20, %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm16, %ymm3, %ymm2 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm17, %ymm20, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3,4,5],xmm8[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, 
%ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm2, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm6[3],ymm13[4,5],ymm6[6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm24, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4],ymm1[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm16, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2,3],ymm12[4,5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm6[2,3],ymm14[4,5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm24 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm15, %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; 
AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm17, %ymm20, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm26, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm4[1],xmm9[2],xmm4[3],xmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload ; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1,2],ymm7[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm2, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm19, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm16, %ymm14, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,12,0,3,7,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm25, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm17, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm1, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm26 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte 
Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm19, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 
64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1768, %rsp # imm = 0x6E8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1800, %rsp # imm = 0x708 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vporq %ymm3, %ymm2, %ymm27 -; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: subq $1624, %rsp # imm = 0x658 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm7 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm3, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm19[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 
252(%rdi), %xmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm4[2],ymm8[3,4,5],ymm4[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7,8,9,10],ymm0[11],ymm2[12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3,4,5],xmm4[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 ; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm12[2],ymm3[3,4,5],ymm12[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm4[1],xmm13[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm25 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm25[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,1,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} 
ymm6 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm1[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7,8,9,10],ymm6[11],ymm7[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm20[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4],ymm7[5,6,7,8,9,10,11],ymm6[12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm19[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm12 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, 
%ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4],ymm11[5,6,7,8,9,10,11],ymm9[12],ymm11[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm6 +; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm25[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 
+; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpor %ymm9, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm9 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vporq %ymm8, %ymm7, %ymm30 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm4[1],ymm11[2,3,4],ymm4[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm18[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm19[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw 
{{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm22, %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm11[1],ymm9[2,3,4],ymm11[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm25[0,1,1,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm18, %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm22, %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3],xmm2[4],xmm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm23, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm29, %xmm8 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm4, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; 
AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2],xmm8[3],xmm4[4],xmm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5],xmm1[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm16, %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3],xmm5[4],xmm0[5],xmm5[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7,8,9,10,11,12,13],ymm3[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3,4,5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6],ymm9[7,8,9,10,11,12,13],ymm13[14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm6[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6],ymm6[7,8,9,10,11,12,13],ymm14[14],ymm6[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3,4,5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0],ymm2[1],ymm11[2,3],ymm2[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm17 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm17[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX512DQ-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4],xmm13[5],xmm9[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2,3,4,5,6],ymm15[7,8],ymm13[9,10,11,12,13,14],ymm15[15] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3,4,5,6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm13, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm29[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7,8],ymm13[9,10,11,12,13,14],ymm14[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1,2,3,4,5,6],ymm13[7,8],ymm11[9,10,11,12,13,14],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3,4,5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm2[1],ymm11[2,3,4],ymm2[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,0,1,14,15,8,9,10,11,4,5,6,7,20,21,20,21,16,17,30,31,24,25,26,27,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm13 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4,5,6,7,8],ymm0[9],ymm9[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm4[2],ymm1[3,4,5],ymm4[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm9 -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm29 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4,5,6,7,8],ymm0[9],ymm13[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm31, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm15 +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm17 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm29[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm17[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm24 ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm15 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2,3,4],ymm3[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512DQ-SLOW-NEXT: 
vpshufb %ymm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm23, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5],xmm12[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm31, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm15[2],ymm2[3,4,5],ymm15[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5],xmm12[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm28, %zmm24 -; 
AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm21 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm5[2],ymm9[3,4],ymm5[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,0,4,5,6,4] +; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm4[2],ymm7[3,4,5],ymm4[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm30 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6],ymm13[7,8,9,10,11,12,13],ymm0[14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %ymm2, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6],ymm12[7,8,9,10,11,12,13],ymm1[14],ymm12[15] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm9[1],xmm14[2,3,4,5],xmm9[6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm2 # 64-byte Folded Reload -; 
AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm29 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3,4,5],xmm1[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm13[2,3],ymm3[4,5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7,8,9,10],ymm14[11],ymm12[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2,3,4,5],xmm11[6],xmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4,5],ymm14[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%ymm18, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5,6,7,8,9,10],ymm11[11],ymm9[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm17 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, 
%xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5,6,7,8,9,10],ymm11[11],ymm1[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm11, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm28 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; 
AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm20 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1,2,3,4,5,6],ymm11[7,8],ymm1[9,10,11,12,13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = 
[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0,1],ymm1[2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2],xmm9[3],xmm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5],xmm11[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm9[0],ymm1[1,2,3,4,5,6],ymm9[7,8],ymm1[9,10,11,12,13,14],ymm9[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1,2,3,4,5,6],ymm11[7,8],ymm1[9,10,11,12,13,14],ymm11[15] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm9 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] @@ -15145,814 +15022,868 @@ ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm28, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: 
vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7],ymm7[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6,7,8],ymm7[9],ymm4[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4],ymm2[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2],xmm2[3],xmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4,5],mem[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm23 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 64(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $1624, %rsp # imm = 0x658 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1288, %rsp # imm = 0x508 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,4,13,4,13,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm5, %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,0,2] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm2 ; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm16, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm19 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm3[1],xmm13[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm21 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm21[0,1,0,2] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, 
%ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm0[1],xmm6[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm15, %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm22 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm5[1],xmm7[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %ymm16, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm3[1],xmm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm10, %ymm13 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm13 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,0,3,2,3,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm28, %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm17 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm24, %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm22 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm16, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,11,14,7,4,5,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm23, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] +; AVX512DQ-FAST-NEXT: 
vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5],xmm4[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm28[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm27, %ymm24, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2],xmm5[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: 
vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm22[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm31, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: 
vmovdqa64 576(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm2, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm18, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm18 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm16, %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,3,3,0,3,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7],ymm7[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm20, %ymm7, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512DQ-FAST-NEXT: vpermd %ymm28, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm17, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm24, %ymm7, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 864(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm20 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm23, %ymm7, %ymm20 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm21, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm11 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm31 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm16, %ymm17, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = 
[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5,6,7],ymm3[8,9,10,11,12],ymm14[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm13 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm26, %ymm17, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpermd %ymm22, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm16, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3],xmm9[4],xmm2[5],xmm9[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3,4,5,6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm26, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3],xmm15[4],xmm8[5],xmm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; 
AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3,4,5,6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7] -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm9, %zmm26, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm27 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2d %ymm26, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 800(%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %ymm16 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm19, %ymm16, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3,4,5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <9,u,u,u,12,0,3,7> +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm9, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5,6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm24, %ymm17, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,6,7,2,3,4,5,10,11,12,13,20,21,18,19,20,21,22,23,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm16, %ymm19, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm23, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm3[2],ymm15[3,4,5],ymm3[6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %ymm23, %ymm17, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <9,u,u,u,13,0,4,7> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm15, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3,4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm16, %ymm19, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4,5],ymm7[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4],xmm3[5],xmm10[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm26, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm21, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, 
%xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm27, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; 
AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm27, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm29, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,10,3,4,13,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm25 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5],xmm15[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1],xmm8[2],xmm15[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm22, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5],xmm8[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: 
vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm16, %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm23, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3,4,5],xmm9[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %ymm15, %ymm9, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm19, %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm25 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} 
xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm3 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1,2],ymm11[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %ymm18, %ymm1, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm30 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm24, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm20 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm3, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm15, %ymm4, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %ymm19, %ymm16, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm18, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm27, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm31 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <9,12,0,3,7,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm17, %ymm9, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm19, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7] +; 
AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2],xmm6[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm25, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm27, %zmm29 +; 
AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm29 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512DQ-FAST-NEXT: addq $1288, %rsp # imm = 0x508 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -541,7 +541,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] @@ -560,7 +560,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -603,7 +603,7 @@ ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm3, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 @@ -615,18 +615,18 @@ ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] +; 
AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [3,7,3,3] ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -716,46 +716,45 @@ ; SSE-LABEL: load_i16_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 192(%rdi), %xmm4 ; SSE-NEXT: movdqa 240(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm12 +; SSE-NEXT: movdqa 224(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; 
SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 @@ -774,137 +773,136 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] +; SSE-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm11[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[3,3,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm3[2,3] ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movapd %xmm15, 16(%r9) ; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: movapd %xmm12, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, (%rax) -; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movapd %xmm10, 16(%rax) ; SSE-NEXT: movaps %xmm6, (%rax) -; SSE-NEXT: movapd %xmm9, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm12, (%rax) ; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; @@ -1288,7 +1286,7 @@ ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] @@ -1353,7 +1351,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1456,7 +1454,7 @@ ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm13 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 @@ -1490,7 +1488,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = 
xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ -1506,7 +1504,7 @@ ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm12, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -1523,7 +1521,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [3,7,3,3] ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1723,12 +1721,12 @@ ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] @@ -1826,7 +1824,7 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm0 @@ -1913,13 +1911,13 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] ; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -1927,239 +1925,242 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3] +; 
SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movapd %xmm6, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: 
unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded 
Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm10, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%r8) +; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%r9) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movapd %xmm14, 32(%rax) ; SSE-NEXT: movapd %xmm13, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) -; SSE-NEXT: movapd %xmm11, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movapd %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm15, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm15, 16(%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 ; SSE-NEXT: retq @@ -2567,292 +2568,284 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: addq $872, %rsp # imm = 0x368 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; 
AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 +; 
AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; 
AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa %xmm9, %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = 
xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vmovdqa %xmm15, %xmm9 +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; 
AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0],xmm4[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -2880,23 +2873,23 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm13, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] @@ -2907,8 +2900,8 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2920,16 +2913,16 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] 
; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] @@ -2940,11 +2933,11 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] @@ -2955,111 +2948,112 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps 
%ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-ONLY-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -3076,7 +3070,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] @@ -3238,7 +3232,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-SLOW-NEXT: 
vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -3277,7 +3271,7 @@ ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload @@ -3409,7 +3403,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -3466,7 +3460,7 @@ ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -3563,7 +3557,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11 @@ -3594,7 +3588,7 @@ ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm2 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm22, %xmm0, %xmm2 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] @@ -3630,7 +3624,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = [3,7,3,3] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -3667,7 +3661,7 @@ ; AVX512F-FAST-NEXT: 
vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm3, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] @@ -3733,7 +3727,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,1,1] ; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -3762,7 +3756,7 @@ ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm0, %xmm8 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -3986,291 +3980,275 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1800, %rsp # imm = 0x708 -; SSE-NEXT: movdqa 752(%rdi), %xmm2 +; SSE-NEXT: subq $1784, %rsp # imm = 0x6F8 +; SSE-NEXT: movdqa 240(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: movdqa 624(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 608(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa 528(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm6 +; SSE-NEXT: movdqa 512(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa 560(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm0 +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 704(%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 688(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm1 +; SSE-NEXT: movdqa 736(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 720(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa 688(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa 656(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa 640(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1008(%rdi), %xmm0 +; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 992(%rdi), %xmm1 +; SSE-NEXT: movdqa 864(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 832(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 944(%rdi), %xmm2 +; SSE-NEXT: movdqa 816(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 928(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: 
movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm2 +; SSE-NEXT: movdqa 800(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE-NEXT: movdqa 784(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa 480(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 432(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 416(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 +; SSE-NEXT: movdqa 1008(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa 992(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm0 +; SSE-NEXT: movdqa 944(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm15 +; SSE-NEXT: movdqa 928(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa 912(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4283,113 +4261,123 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; 
SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movapd %xmm11, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movapd %xmm4, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movapd %xmm7, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -4402,19 +4390,15 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -4426,43 +4410,46 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 
# 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] @@ -4470,46 +4457,47 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: 
punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4521,8 +4509,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -4536,23 +4524,24 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4560,16 +4549,40 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} 
xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] @@ -4578,29 +4591,6 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = 
xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4612,8 +4602,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4635,87 +4625,85 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm14 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa 
%xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -4724,33 +4712,32 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4762,32 +4749,31 @@ ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm0 -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movapd %xmm12, %xmm0 +; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movaps %xmm2, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4828,17 +4814,15 @@ ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] @@ -4847,66 +4831,67 @@ ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,3,3,3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps %xmm15, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rdx) +; SSE-NEXT: movaps %xmm15, 
64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rdx) +; SSE-NEXT: movaps %xmm15, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rdx) +; SSE-NEXT: movaps %xmm15, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps %xmm15, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rcx) +; SSE-NEXT: movaps %xmm15, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rcx) +; SSE-NEXT: movaps %xmm15, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps %xmm15, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 80(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 96(%r8) @@ -4977,62 +4962,62 @@ ; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm8, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1800, %rsp # imm = 0x708 +; SSE-NEXT: addq $1784, %rsp # imm = 0x6F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: subq $2040, %rsp # imm = 0x7F8 +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5042,57 +5027,57 @@ ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5102,59 +5087,59 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: 
vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5184,7 +5169,7 @@ ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5200,7 +5185,7 @@ ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5291,14 +5276,14 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] @@ -5428,7 +5413,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] @@ -5436,7 +5421,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm2, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -5447,23 +5432,25 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -5472,31 +5459,30 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5508,8 +5494,8 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -5522,12 +5508,13 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -5537,9 +5524,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5555,8 +5541,42 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5565,388 +5585,356 @@ ; AVX1-ONLY-NEXT: vpunpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; 
AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = 
mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0],xmm9[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm13[2],mem[2],xmm13[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: 
vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 
96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $2040, %rsp # imm = 0x7F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $2376, %rsp # imm = 0x948 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5959,9 +5947,9 @@ ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = 
ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5977,28 +5965,28 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 816(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6006,9 +5994,9 @@ ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6019,54 +6007,54 @@ ; AVX2-ONLY-NEXT: 
vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6077,76 +6065,76 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 880(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, 
%xmm4 -; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 784(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 816(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3,4,5,6],ymm6[7] +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6155,10 +6143,10 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm6[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] @@ -6187,20 +6175,20 @@ ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw 
{{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -6208,15 +6196,15 @@ ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6233,7 +6221,7 @@ ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded 
Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6274,11 +6262,10 @@ ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6287,28 +6274,28 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] @@ -6365,35 +6352,34 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = 
mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -6406,25 +6392,25 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6482,10 +6468,10 @@ ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] @@ -6493,56 +6479,10 @@ ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6560,118 +6500,168 @@ ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: 
vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm10, %xmm10 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm11 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm12[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] ; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: 
vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] 
+; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6694,213 +6684,210 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd $7, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-ONLY-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = 
xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: addq $2376, %rsp # imm = 0x948 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -6917,7 +6904,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] @@ -7306,7 +7293,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, 
%xmm1 ; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -7379,7 +7366,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm10, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload @@ -7720,7 +7707,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -7827,7 +7814,7 @@ ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -8059,7 +8046,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -8132,7 +8119,7 @@ ; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm24, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload @@ -8234,7 +8221,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,3,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -8307,7 +8294,7 @@ ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 -; 
AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm29 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -8486,7 +8473,7 @@ ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [1,5,1,1] ; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -8553,7 +8540,7 @@ ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm29, %xmm5, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -8651,7 +8638,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm17 = [3,7,3,3] ; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -53,25 +53,15 @@ ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: load_i32_stride2_vf4: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX1-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-NEXT: vmovaps %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX512-LABEL: load_i32_stride2_vf4: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512-NEXT: vmovaps %xmm1, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: load_i32_stride2_vf4: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX-NEXT: vmovaps %xmm2, (%rsi) +; AVX-NEXT: vmovaps %xmm0, (%rdx) +; AVX-NEXT: retq %wide.vec = load <8 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> @@ -87,16 +77,16 @@ ; SSE-NEXT: movaps 
16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf8: @@ -126,23 +116,27 @@ ; ; AVX512-SLOW-LABEL: load_i32_stride2_vf8: ; AVX512-SLOW: # %bb.0: -; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX512-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7] -; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX512-SLOW-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX512-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX512-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-SLOW-NEXT: vmovaps %ymm2, (%rsi) +; AVX512-SLOW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512-SLOW-NEXT: vzeroupper ; AVX512-SLOW-NEXT: retq ; ; AVX512-FAST-LABEL: load_i32_stride2_vf8: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512-FAST-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 -; AVX512-FAST-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14] +; AVX512-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX512-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] +; AVX512-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm3 +; AVX512-FAST-NEXT: vmovaps %ymm0, (%rsi) +; AVX512-FAST-NEXT: vmovaps %ymm3, (%rdx) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 @@ -160,48 +154,48 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm4 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] ; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} 
xmm10 = xmm10[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm6[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: movaps %xmm9, 32(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps %xmm11, (%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm9, 48(%rsi) +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -211,18 +205,18 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -249,21 +243,21 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i32_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rdi), %xmm9 -; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm14 -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm12 -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rdi), %xmm2 +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 176(%rdi), %xmm12 +; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm4, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm14[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm14[1,3] @@ -286,27 +280,27 @@ ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm11, 96(%rsi) -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps %xmm12, 112(%rsi) -; SSE-NEXT: movaps %xmm10, 48(%rsi) -; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm13, 80(%rsi) -; SSE-NEXT: movaps %xmm9, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm5, 112(%rdx) -; SSE-NEXT: movaps %xmm6, 64(%rdx) -; SSE-NEXT: movaps %xmm7, 80(%rdx) -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 112(%rsi) +; SSE-NEXT: movaps %xmm14, 48(%rsi) +; SSE-NEXT: movaps %xmm12, 96(%rsi) +; SSE-NEXT: movaps %xmm10, 32(%rsi) +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps %xmm13, 64(%rsi) +; SSE-NEXT: movaps %xmm9, (%rsi) +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm5, 96(%rdx) +; SSE-NEXT: movaps %xmm6, 80(%rdx) +; SSE-NEXT: movaps %xmm7, 64(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf32: @@ -315,30 +309,30 @@ ; AVX1-ONLY-NEXT: 
vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm6[0,2],ymm1[4,6],ymm6[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,2],ymm8[0,2],ymm0[4,6],ymm8[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm10[0,2],ymm2[4,6],ymm10[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm6[1,3],ymm1[5,7],ymm6[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm8[1,3],ymm0[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm10[1,3],ymm2[5,7],ymm10[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm6[0,2],ymm2[4,6],ymm6[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,2],ymm8[0,2],ymm1[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm3[0,2],ymm10[0,2],ymm3[4,6],ymm10[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm8[1,3],ymm1[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm6[1,3],ymm2[5,7],ymm6[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm10[1,3],ymm3[5,7],ymm10[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -348,34 +342,34 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6] ; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,3],ymm4[1,3],ymm5[5,7],ymm4[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -410,21 +404,21 @@ ; SSE-LABEL: load_i32_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps 224(%rdi), %xmm8 -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm13 -; SSE-NEXT: movaps 176(%rdi), %xmm4 -; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 240(%rdi), %xmm11 +; SSE-NEXT: movaps 224(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps 128(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm3[0,2] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -460,236 +454,236 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm13 = 
xmm13[1,3],xmm0[1,3] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm12[1,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps 336(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,3],xmm0[1,3] -; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm13 +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 352(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm11 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: movaps 400(%rdi), %xmm0 +; SSE-NEXT: movaps 384(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,3],xmm0[1,3] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps 432(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 480(%rdi), %xmm7 +; SSE-NEXT: movaps 464(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm0[1,3] -; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 480(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm1[1,3] -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm1[1,3] -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm0[1,3] -; SSE-NEXT: movaps %xmm2, 224(%rsi) -; SSE-NEXT: movaps %xmm11, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm6, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps %xmm2, 240(%rsi) +; SSE-NEXT: movaps %xmm11, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps %xmm10, 192(%rsi) +; SSE-NEXT: movaps %xmm6, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm10, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) -; SSE-NEXT: movaps %xmm9, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm13, 160(%rdx) -; SSE-NEXT: movaps %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps %xmm14, 192(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movaps %xmm3, 240(%rdx) +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps %xmm12, 192(%rdx) +; SSE-NEXT: movaps %xmm13, 176(%rdx) +; SSE-NEXT: movaps %xmm15, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2],ymm8[0,2],ymm10[4,6],ymm8[4,6] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] -; 
AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm7[0,2],ymm11[4,6],ymm7[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,3],ymm7[1,3],ymm11[5,7],ymm7[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,3],ymm5[1,3],ymm6[5,7],ymm5[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[0,2],ymm7[0,2],ymm3[4,6],ymm7[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm7[1,3],ymm3[5,7],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,3],ymm8[1,3],ymm10[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = 
ymm4[0,2],ymm15[0,2],ymm4[4,6],ymm15[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm15[1,3],ymm4[5,7],ymm15[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2],ymm13[0,2],ymm6[4,6],ymm13[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm13[1,3],ymm6[5,7],ymm13[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm11[0,2],ymm9[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3],ymm11[1,3],ymm9[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm15[0,2],ymm0[4,6],ymm15[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm15[1,3],ymm0[5,7],ymm15[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride2_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,2],ymm2[0,2],ymm13[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,3],ymm2[1,3],ymm13[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm5[0,2],ymm15[4,6],ymm5[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,3],ymm5[1,3],ymm15[5,7],ymm5[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2],ymm6[0,2],ymm14[4,6],ymm6[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,3],ymm6[1,3],ymm14[5,7],ymm6[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm10[0,2],ymm12[4,6],ymm10[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,3],ymm10[1,3],ymm12[5,7],ymm10[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,2],ymm8[0,2],ymm11[4,6],ymm8[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,3],ymm8[1,3],ymm11[5,7],ymm8[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2],ymm7[0,2],ymm9[4,6],ymm7[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,3],ymm7[1,3],ymm9[5,7],ymm7[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2],ymm3[0,2],ymm15[4,6],ymm3[4,6] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,3],ymm3[1,3],ymm15[5,7],ymm3[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,2],ymm5[0,2],ymm8[4,6],ymm5[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3],ymm5[1,3],ymm8[5,7],ymm5[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm13[0,2],ymm14[4,6],ymm13[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[1,3],ymm13[1,3],ymm14[5,7],ymm13[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm9[0,2],ymm10[4,6],ymm9[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,3],ymm9[1,3],ymm10[5,7],ymm9[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,3],ymm11[1,3],ymm12[5,7],ymm11[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = 
ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -708,22 +702,22 @@ ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] -; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 -; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 +; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm0 +; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i32>, ptr %in.vec, align 64 @@ -734,6 +728,7 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -100,22 +100,22 @@ ; SSE-LABEL: load_i32_stride3_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[2,0] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, (%rsi) ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf4: @@ -126,8 +126,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] @@ -185,40 +184,40 @@ define void 
@load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movaps 80(%rdi), %xmm4 ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm7, 16(%rsi) -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[2,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm6[2,0] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[2,0] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] +; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm8, (%rsi) +; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm6, (%rcx) +; SSE-NEXT: movaps %xmm9, 16(%rcx) +; SSE-NEXT: movaps %xmm5, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf8: @@ -227,27 +226,27 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[0,2],ymm3[4,7],ymm5[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,0],ymm5[2,0],ymm0[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0],ymm6[2,0],ymm5[4,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm4[0,3],ymm7[5,6],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,3],ymm5[4,5],ymm0[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -371,99 +370,78 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm9 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm13 ; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 
%xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0] -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[1,0],xmm13[2,0] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[2,0] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm14[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rdx) -; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) -; SSE-NEXT: movaps %xmm4, 32(%rcx) -; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm6[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, 48(%rsi) +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm10, 48(%rdx) +; SSE-NEXT: movaps %xmm11, 32(%rdx) +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps %xmm4, 48(%rcx) +; SSE-NEXT: movaps %xmm13, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm1, (%rcx) ; 
SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf16: @@ -472,52 +450,52 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[0,3],ymm6[6,4],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[0,3],ymm7[4,5],ymm3[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,1],ymm5[1,3],ymm7[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm8[0,2],ymm4[4,7],ymm8[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,0],ymm8[2,0],ymm3[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[2,1],ymm1[1,3],ymm10[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,3],ymm11[0,2],ymm9[4,7],ymm11[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[2,0],ymm0[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm8[3,0],ymm3[6,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,0],ymm12[2,0],ymm8[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,2],ymm7[0,3],ymm13[5,6],ymm7[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm11[3,0],ymm0[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,0],ymm12[2,0],ymm11[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[1,2],ymm10[0,3],ymm13[5,6],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,0],ymm6[2,0],ymm12[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[0,3],ymm6[6,4],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1],ymm3[0,3],ymm8[4,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm2[2,0],ymm13[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm2[2,0],ymm5[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm0[0,3],ymm9[4,5],ymm0[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm0[0,3],ymm11[4,5],ymm0[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -704,476 +682,430 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $392, %rsp # imm = 0x188 -; SSE-NEXT: movaps 192(%rdi), %xmm4 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 208(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm6 -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: movaps 240(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: movaps 256(%rdi), %xmm10 +; SSE-NEXT: movaps 272(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 208(%rdi), %xmm2 +; SSE-NEXT: movaps 224(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] ; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[2,0] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[2,0] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps 304(%rdi), %xmm5 +; SSE-NEXT: movaps 320(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[2,0] +; SSE-NEXT: movaps 288(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[2,0] +; SSE-NEXT: movaps 144(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 288(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 336(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] -; 
SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0] -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm7[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm7[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm7[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm12[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[0,1],xmm10[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm12[0,3] +; SSE-NEXT: movaps %xmm14, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm14[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq 
{{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm14[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps %xmm7, 112(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm10, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps %xmm14, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 
-; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 64(%rcx) -; SSE-NEXT: movaps %xmm13, 80(%rcx) -; SSE-NEXT: movaps %xmm4, 32(%rcx) -; SSE-NEXT: movaps %xmm6, 48(%rcx) -; SSE-NEXT: movaps %xmm5, (%rcx) -; SSE-NEXT: movaps %xmm8, 16(%rcx) -; SSE-NEXT: addq $392, %rsp # imm = 0x188 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, (%rsi) +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps %xmm10, 48(%rdx) +; SSE-NEXT: movaps %xmm13, 96(%rdx) +; SSE-NEXT: movaps %xmm9, 32(%rdx) +; SSE-NEXT: movaps %xmm15, 80(%rdx) +; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps %xmm8, 64(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm6, 112(%rcx) +; SSE-NEXT: movaps %xmm3, 96(%rcx) +; SSE-NEXT: movaps %xmm4, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm5, 48(%rcx) +; SSE-NEXT: movaps %xmm7, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm12, (%rcx) +; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX1-ONLY-NEXT: 
vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm12[2,0],ymm6[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm8[1,3],ymm1[6,5],ymm8[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] 
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm4[1,3],ymm1[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm13[1,3],ymm5[6,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,3],ymm14[0,2],ymm2[4,7],ymm14[4,6] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[1,0],ymm2[2,0],ymm14[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4] 
-; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,0],ymm10[2,0],ymm12[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm7[3,0],ymm10[6,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[2,0],ymm3[3,0],ymm6[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[0,0],ymm10[2,0],ymm3[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm5[0,3],ymm0[5,6],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,0],ymm2[3,0],ymm14[6,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[2,0],ymm2[4,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,0],ymm0[2,0],ymm10[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[0,3],ymm0[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,0],ymm7[2,0],ymm9[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[2,0],ymm4[0,3],ymm7[6,4],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm6[0,3],ymm3[4,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, (%rsp), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm8[0,3],ymm3[6,4],ymm8[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm13[0,3],ymm4[6,4],ymm13[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm14[0,3],ymm2[4,5],ymm14[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $136, %rsp -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm1, %ymm7 
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm8 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm15 ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm13[2],ymm1[3,4],ymm13[5],ymm1[6,7] +; 
AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: addq $136, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1181,101 +1113,99 @@ ; AVX2-FAST-LABEL: load_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $104, %rsp -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm7 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm11 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; 
AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm9, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1283,106 +1213,105 @@ ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, 
%ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm13[2],ymm1[3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1436,812 +1365,750 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1112, %rsp # imm = 0x458 -; SSE-NEXT: movaps 624(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm5 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 +; SSE-NEXT: movaps 576(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps 592(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm3 ; SSE-NEXT: movaps 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 400(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[2,0] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[2,0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[2,0] +; SSE-NEXT: movaps 48(%rdi), %xmm5 +; 
SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps 256(%rdi), %xmm10 +; SSE-NEXT: movaps 272(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 192(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[2,0] +; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm0 +; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 384(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm1 +; SSE-NEXT: movaps 640(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm0 +; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 624(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 304(%rdi), %xmm11 +; SSE-NEXT: movaps 320(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[2,0] +; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0] +; SSE-NEXT: movaps 480(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 688(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0] +; SSE-NEXT: movaps 672(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps 176(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[2,0] +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm9[2,0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm9 +; SSE-NEXT: movaps 368(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm9[2,0] +; SSE-NEXT: movaps 336(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm13[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 560(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 528(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 560(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm0[2,0] +; SSE-NEXT: movaps 528(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm13[2,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 736(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 720(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 112(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm7 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps 304(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0] +; SSE-NEXT: movaps 752(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm0[2,0] +; SSE-NEXT: movaps 720(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm13[2,0] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm12[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] -; SSE-NEXT: movaps 480(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 688(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0] -; SSE-NEXT: movaps 672(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 
= xmm15[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm3 -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] 
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,3] -; SSE-NEXT: 
pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm14, 224(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 
128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 240(%rcx) -; SSE-NEXT: movaps %xmm1, 224(%rcx) -; SSE-NEXT: movaps %xmm2, 208(%rcx) -; SSE-NEXT: movaps %xmm3, 192(%rcx) -; SSE-NEXT: movaps %xmm5, 176(%rcx) -; SSE-NEXT: movaps %xmm6, 160(%rcx) -; SSE-NEXT: movaps %xmm7, 144(%rcx) -; SSE-NEXT: movaps %xmm8, 128(%rcx) -; SSE-NEXT: movaps %xmm9, 112(%rcx) -; SSE-NEXT: movaps %xmm10, 96(%rcx) -; SSE-NEXT: movaps %xmm11, 80(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps %xmm15, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm2, 240(%rdx) +; SSE-NEXT: movaps %xmm4, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm8, 192(%rdx) +; SSE-NEXT: movaps %xmm10, 176(%rdx) +; SSE-NEXT: movaps %xmm12, 160(%rdx) +; SSE-NEXT: movaps %xmm14, 144(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm5, 240(%rcx) +; SSE-NEXT: movaps %xmm7, 224(%rcx) +; SSE-NEXT: movaps %xmm9, 208(%rcx) +; SSE-NEXT: movaps %xmm11, 192(%rcx) +; SSE-NEXT: movaps %xmm13, 176(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm0, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $1112, %rsp # imm = 0x458 +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
+; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX1-ONLY-NEXT: subq $1064, %rsp # imm = 0x428 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm8[1,3],ymm1[6,5],ymm8[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm1[2,0],ymm7[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[2,0],ymm1[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[2,0],ymm1[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm2[0,2],ymm1[4,7],ymm2[4,6] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm3[0,2],ymm2[4,7],ymm3[4,6] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm11[2,0],ymm2[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm10[0,2],ymm3[4,7],ymm10[4,6] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm10[2,0],ymm0[5,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,1],ymm3[1,3],ymm0[6,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,3],ymm15[0,2],ymm14[4,7],ymm15[4,6] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[3,0],ymm15[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm15[2,0],ymm1[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[3,0],ymm15[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm15[2,0],ymm1[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[3,0],ymm15[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm15[2,0],ymm1[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7] +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, (%rsp), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, 
%ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm0[3,0],ymm15[6,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm5[3,0],ymm6[6,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm15[2,0],ymm5[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm15[2,0],ymm11[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] +; 
AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm10[3,0],ymm9[6,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[0,0],ymm15[2,0],ymm10[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[2,0],ymm14[3,0],ymm10[6,4],ymm14[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,0],ymm15[2,0],ymm14[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[0,3],ymm0[6,4],ymm1[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[0,3],ymm0[6,4],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm6[0,3],ymm5[4,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[0,3],ymm4[6,4],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm8[0,3],ymm5[6,4],ymm8[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1],ymm13[0,3],ymm11[4,5],ymm13[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm7[0,3],ymm5[6,4],ymm7[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1],mem[0,3],ymm6[4,5],mem[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = 
ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[0,3],ymm6[6,4],ymm2[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm9[0,3],ymm6[4,5],ymm9[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = 
ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm8[0,3],ymm6[6,4],ymm8[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1],mem[0,3],ymm7[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[2,0],ymm3[0,3],ymm7[6,4],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,1],ymm10[0,3],ymm14[4,5],ymm10[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX1-ONLY-NEXT: addq $1064, %rsp # imm = 0x428 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 @@ -2257,110 +2124,110 @@ ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm6 ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm1 @@ -2368,67 +2235,59 @@ ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm15 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm15, %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm15, %ymm6 ; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm15, %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rsi) @@ -2436,14 +2295,14 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%rsi) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rdx) @@ -2451,14 +2310,22 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -2466,28 +2333,28 @@ ; AVX2-FAST-LABEL: load_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovups %ymm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 @@ -2503,104 +2370,104 @@ ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm10 ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] ; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $219, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm9, %ymm13 @@ -2610,73 +2477,65 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,0,3,0,1,4,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; 
AVX2-FAST-NEXT: vpermps %ymm3, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm15, %ymm6 ; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-NEXT: 
vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -2685,14 +2544,22 @@ ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -2700,28 +2567,28 @@ ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 
544(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 @@ -2737,110 +2604,110 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = 
mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm1 @@ -2848,67 +2715,59 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm15 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm15, %ymm2 ; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm3 = ymm13[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm15, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm15, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm15, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm15, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rsi) -; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rsi) @@ -2916,14 +2775,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rdx) @@ -2931,14 +2790,22 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; 
AVX2-FAST-PERLANE-NEXT: retq @@ -2971,13 +2838,13 @@ ; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] -; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 -; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 +; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm19 +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 @@ -2987,10 +2854,10 @@ ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 -; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 ; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 ; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 +; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 ; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 ; AVX512-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) @@ -2998,13 +2865,13 @@ ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -147,59 +147,40 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: load_i32_stride4_vf4: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovddup 
{{.*#+}} xmm7 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%r8) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: load_i32_stride4_vf4: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX2-LABEL: load_i32_stride4_vf4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4] +; AVX2-NEXT: # xmm0 = mem[0,0] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-NEXT: # xmm7 = mem[0,0] +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6] +; AVX2-NEXT: # xmm7 = mem[0,0] +; AVX2-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-NEXT: vmovaps %xmm6, (%rdx) +; AVX2-NEXT: vmovaps %xmm1, (%rcx) +; AVX2-NEXT: vmovaps %xmm2, (%r8) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> @@ -439,13 +420,13 @@ ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: 
movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] +; SSE-NEXT: movaps (%rdi), %xmm14 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload @@ -459,38 +440,38 @@ ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm7[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] ; SSE-NEXT: movaps %xmm10, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movaps %xmm14, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm6, (%rsi) ; SSE-NEXT: movaps %xmm15, 48(%rdx) -; SSE-NEXT: movaps %xmm14, (%rdx) ; SSE-NEXT: movaps %xmm5, 32(%rdx) ; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm7, 48(%rcx) -; SSE-NEXT: movaps %xmm6, 32(%rcx) +; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: movaps %xmm10, 48(%r8) ; SSE-NEXT: movaps %xmm3, 32(%r8) ; SSE-NEXT: movaps %xmm11, 16(%r8) -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm14, (%r8) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; @@ -791,39 +772,38 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 -; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movaps 336(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm8 +; SSE-NEXT: movaps 368(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps 352(%rdi), %xmm11 +; SSE-NEXT: movaps 80(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm10 +; SSE-NEXT: movaps 
112(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm4 -; SSE-NEXT: movaps 96(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 @@ -831,9 +811,9 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movaps 256(%rdi), %xmm1 +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -841,29 +821,29 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm2 +; SSE-NEXT: movaps 400(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -871,206 +851,199 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 400(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdi), %xmm0 +; SSE-NEXT: movaps 224(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 464(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 448(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: movaps %xmm8, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm5[1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 16(%rsi) -; SSE-NEXT: movaps %xmm7, 96(%rdx) -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 64(%rdx) -; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm3, 96(%rcx) -; SSE-NEXT: movaps %xmm9, 32(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps %xmm13, 64(%rcx) -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, (%rsi) +; SSE-NEXT: movaps %xmm6, 112(%rdx) +; SSE-NEXT: movaps %xmm15, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 64(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm3, 112(%rcx) +; SSE-NEXT: movaps %xmm11, 96(%rcx) ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm8, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r8) -; SSE-NEXT: movaps %xmm12, 96(%r8) +; SSE-NEXT: movaps %xmm5, 64(%rcx) +; SSE-NEXT: movaps %xmm9, 48(%rcx) +; SSE-NEXT: movaps %xmm12, 32(%rcx) +; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm10, (%rcx) +; SSE-NEXT: movaps %xmm13, 112(%r8) +; SSE-NEXT: movaps %xmm1, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps %xmm14, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm15, 32(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm6, (%r8) -; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1000, 
%rsp # imm = 0x3E8 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $984, %rsp # imm = 0x3D8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1083,178 +1056,181 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,0] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[4],ymm10[4],ymm15[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[1,0],ymm8[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm9[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm13[1],xmm3[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5] -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm5[1,0],ymm10[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm12[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 
= ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm11[1,0],mem[1,0],ymm11[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm8[2,3],ymm1[6,4],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm12[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm9[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vinsertps $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = zero,zero,xmm5[2],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm9[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm12[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm14[1],ymm6[3],ymm14[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -1263,97 +1239,94 @@ ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm8[3,0],ymm3[7,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[2,3],ymm3[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm3[2,3],ymm6[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm15, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[3,0],mem[3,0],ymm9[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,0],ymm1[2,3],ymm9[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,0],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,0],ymm5[3,0],ymm4[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = 
ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm11[3,0],ymm3[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: addq $984, %rsp # imm = 0x3D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $728, %rsp # imm = 0x2D8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 
480(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,4,0,4] @@ -1362,40 +1335,41 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} 
xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1408,173 +1382,171 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm1, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 
+; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm15[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm7 -; AVX2-ONLY-NEXT: vpermps 
%ymm11, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [2,6,2,6,2,6,2,6] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; 
AVX2-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm10, %ymm9 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm10, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm10, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm9, %ymm12 +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm9, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 
# 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: addq $728, %rsp # imm = 0x2D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1655,36 +1627,37 @@ ; SSE-LABEL: load_i32_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 176(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 240(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 176(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm4 -; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm8, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm0 @@ -1692,54 +1665,52 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movaps 128(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm12 -; SSE-NEXT: movaps 352(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps 288(%rdi), 
%xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movaps 336(%rdi), %xmm13 -; SSE-NEXT: movaps 320(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 272(%rdi), %xmm13 +; SSE-NEXT: movaps 256(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps 368(%rdi), %xmm14 +; SSE-NEXT: movaps 352(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 336(%rdi), %xmm15 +; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm2 +; SSE-NEXT: movaps 400(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1747,44 +1718,56 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 400(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm1 +; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 464(%rdi), %xmm9 +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 592(%rdi), %xmm2 +; SSE-NEXT: movaps 560(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 544(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 528(%rdi), %xmm8 +; SSE-NEXT: movaps 512(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 624(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 592(%rdi), %xmm7 ; SSE-NEXT: movaps 576(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 560(%rdi), %xmm1 +; SSE-NEXT: movaps 688(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 544(%rdi), %xmm0 +; SSE-NEXT: movaps 672(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 528(%rdi), %xmm2 +; SSE-NEXT: movaps 656(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm1 +; SSE-NEXT: movaps 640(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1807,17 +1790,16 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 688(%rdi), %xmm1 +; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 672(%rdi), 
%xmm0 +; SSE-NEXT: movaps 800(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 656(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps 784(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 768(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1838,52 +1820,35 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm1 +; SSE-NEXT: movaps 944(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 800(%rdi), %xmm0 +; SSE-NEXT: movaps 928(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 784(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 +; SSE-NEXT: movaps 912(%rdi), %xmm11 +; SSE-NEXT: movaps 896(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 992(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 976(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 976(%rdi), %xmm10 ; SSE-NEXT: movaps 960(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 928(%rdi), %xmm1 +; SSE-NEXT: unpckhpd 
{{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm0 @@ -1891,76 +1856,75 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; 
SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] @@ -1968,202 +1932,209 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = 
xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm15, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm13 
= xmm13[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 224(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 
-; SSE-NEXT: movaps %xmm14, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 240(%rcx) -; SSE-NEXT: movaps %xmm12, 224(%rcx) +; SSE-NEXT: movaps %xmm9, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 240(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 176(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 160(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 144(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps 
%xmm7, (%rdx) +; SSE-NEXT: movaps %xmm14, 240(%rcx) +; SSE-NEXT: movaps %xmm0, 224(%rcx) ; SSE-NEXT: movaps %xmm1, 208(%rcx) ; SSE-NEXT: movaps %xmm2, 192(%rcx) ; SSE-NEXT: movaps %xmm3, 176(%rcx) ; SSE-NEXT: movaps %xmm4, 160(%rcx) ; SSE-NEXT: movaps %xmm5, 144(%rcx) ; SSE-NEXT: movaps %xmm6, 128(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps %xmm10, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) @@ -2172,9 +2143,9 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps %xmm13, (%rcx) +; SSE-NEXT: movaps %xmm11, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%r8) -; SSE-NEXT: movaps %xmm7, 224(%r8) +; SSE-NEXT: movaps %xmm0, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2185,7 +2156,8 @@ ; SSE-NEXT: movaps %xmm0, 160(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r8) -; SSE-NEXT: movaps %xmm11, 128(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2200,89 +2172,88 @@ ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) +; SSE-NEXT: movaps %xmm8, (%r8) ; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2184, %rsp # imm = 0x888 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = 
xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2291,22 +2262,22 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), 
%xmm3 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2315,22 +2286,22 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2339,22 +2310,22 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2363,14 +2334,14 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,0] @@ -2390,7 +2361,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 @@ -2400,51 +2371,46 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm7[1,0],ymm14[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm9[1,0],ymm6[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm13[1],xmm8[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm5[1,0],ymm9[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm10[1,0],ymm5[5,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm13[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[1],xmm5[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[1],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2487,7 +2453,7 @@ ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -2519,7 +2485,7 @@ ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -2528,82 +2494,82 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm3[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm3[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm11[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[6],ymm5[6],ymm13[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm4[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 
= ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm10[2],xmm11[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,0],ymm1[4,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload @@ -2635,53 +2601,53 @@ ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm15[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm13[3,0],ymm5[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[3,0],ymm1[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm10[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[2,0],ymm0[2,3],ymm7[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[6],ymm7[6],ymm0[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm15 = xmm7[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm11[3,0],ymm5[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm0[2,3],ymm6[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,0],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[2,3],ymm12[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,0],ymm6[2,3],ymm12[6,4],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] @@ -2689,14 +2655,14 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,0],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = 
ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[2,0],ymm5[2,3],ymm9[6,4],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] @@ -2704,7 +2670,22 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,0],mem[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,0],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm12[2,0],ymm9[2,3],ymm12[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,0],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] @@ -2715,121 +2696,107 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[2,0],xmm8[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[2,0],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = 
ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm2[2,3],ymm7[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm3[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) -; AVX1-ONLY-NEXT: addq $2184, %rsp # imm = 0x888 +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1992, %rsp # imm = 0x7C8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 
256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] @@ -2838,102 +2805,106 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm1, 
%ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm12 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: 
vpermps %ymm5, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 @@ -2943,10 +2914,9 @@ ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 @@ -2960,303 +2930,295 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm14 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm2, 
%ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm14, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovaps 304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm14, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm8, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm14, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps %ymm15, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm14, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm14, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX2-ONLY-NEXT: 
vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm14, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm14, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] ; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm7 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; 
AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm7 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7] ; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: 
vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; 
AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps 
%ymm1, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) @@ -3265,13 +3227,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3281,28 +3243,36 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) -; AVX2-ONLY-NEXT: addq $1992, %rsp # imm = 0x7C8 +; AVX2-ONLY-NEXT: addq $1960, %rsp # imm = 0x7A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -3314,72 +3284,72 
@@ ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-NEXT: vpermt2d %zmm18, %zmm19, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm16 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm16[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-NEXT: vpermt2d %zmm7, %zmm19, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512-NEXT: vpermt2d %zmm18, %zmm20, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2d %zmm14, %zmm20, %zmm22 +; 
AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-NEXT: vpermt2d %zmm12, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512-NEXT: vpermt2d %zmm7, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vpermt2d %zmm4, %zmm20, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm3, %zmm20, %zmm24 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vpermt2d %zmm18, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] @@ -3389,34 +3359,34 @@ ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 +; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm17 ; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm9, %zmm28, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm7, %zmm28, %zmm6 +; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; 
AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm9, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -3433,7 +3403,6 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX1: {{.*}} -; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -68,13 +68,13 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-ONLY-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vbroadcastss 16(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovq %xmm3, (%rsi) ; AVX2-ONLY-NEXT: vmovq %xmm4, (%rdx) ; AVX2-ONLY-NEXT: vpextrq $1, %xmm1, (%rcx) ; AVX2-ONLY-NEXT: vmovq %xmm0, (%r8) -; AVX2-ONLY-NEXT: vmovq %xmm2, (%r9) +; AVX2-ONLY-NEXT: vmovlps %xmm2, (%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -90,7 +90,9 @@ ; AVX512-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512-SLOW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512-SLOW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512-SLOW-NEXT: vmovd %xmm2, %eax +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512-SLOW-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi) @@ -109,16 +111,17 @@ ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 +; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; 
AVX512-FAST-NEXT: vpextrd $3, %xmm1, %eax +; AVX512-FAST-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 ; AVX512-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX512-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FAST-NEXT: vmovq %xmm5, (%rcx) +; AVX512-FAST-NEXT: vmovq %xmm1, (%rcx) ; AVX512-FAST-NEXT: vmovq %xmm0, (%r8) -; AVX512-FAST-NEXT: vmovq %xmm1, (%r9) +; AVX512-FAST-NEXT: vmovq %xmm2, (%r9) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq %wide.vec = load <10 x i32>, ptr %in.vec, align 64 @@ -194,7 +197,7 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] @@ -381,56 +384,55 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[3,0],ymm9[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[2,0],ymm8[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm1[2,1],ymm8[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1],ymm1[1,3],ymm8[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm6[3,0],ymm8[6,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm1[2,0],ymm7[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm1[2,1],ymm7[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2],xmm10[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,0],ymm10[0,0],ymm0[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm1[2,2],ymm11[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm10[1,0],ymm0[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, 
%ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -545,21 +547,21 @@ ; SSE-LABEL: load_i32_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $312, %rsp # imm = 0x138 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm3 -; SSE-NEXT: movdqa 240(%rdi), %xmm14 -; SSE-NEXT: movdqa 256(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm6 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -588,14 +590,14 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: movdqa 240(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa 272(%rdi), %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -605,7 +607,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] @@ -617,7 +619,7 @@ ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] @@ -630,7 +632,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] @@ -642,7 +644,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] @@ -743,35 +745,35 @@ ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movaps %xmm11, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movapd %xmm9, 48(%rcx) +; SSE-NEXT: movapd %xmm12, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rcx) -; SSE-NEXT: movapd %xmm12, 48(%rcx) +; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movapd %xmm1, 16(%r8) -; SSE-NEXT: movapd %xmm2, 48(%r8) -; SSE-NEXT: movapd %xmm6, (%r8) -; SSE-NEXT: movapd %xmm8, 32(%r8) -; SSE-NEXT: movapd %xmm14, 16(%r9) -; SSE-NEXT: movapd %xmm15, 48(%r9) -; SSE-NEXT: movapd %xmm13, (%r9) -; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: movapd %xmm1, 48(%r8) +; SSE-NEXT: movapd %xmm2, 32(%r8) +; SSE-NEXT: movapd %xmm6, 16(%r8) +; SSE-NEXT: movapd %xmm8, (%r8) +; SSE-NEXT: movapd %xmm14, 48(%r9) +; SSE-NEXT: movapd %xmm15, 32(%r9) +; SSE-NEXT: movapd %xmm13, 16(%r9) +; SSE-NEXT: movapd %xmm0, (%r9) ; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; 
@@ -780,120 +782,116 @@ ; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm5[1,3],ymm4[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[3,0],ymm4[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1],ymm2[1,3],ymm3[6,5],ymm2[5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[3,0],ymm3[6,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm5[2,0],ymm1[7,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm13[2],xmm3[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm2[2,0],ymm3[7,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,1],ymm0[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm9[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = 
ymm12[2,0],ymm5[2,2],ymm12[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm2[2,2],ymm11[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm12[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm10[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] @@ -907,11 +905,12 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: 
vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: addq $136, %rsp @@ -1173,154 +1172,156 @@ ; SSE-LABEL: load_i32_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 368(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm6 +; SSE-NEXT: movdqa 320(%rdi), %xmm12 +; SSE-NEXT: movdqa 336(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm7 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa 400(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 416(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 480(%rdi), %xmm15 +; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; 
SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm11 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 272(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa 608(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -1330,48 +1331,60 @@ ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1379,203 +1392,196 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd 
$255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; 
SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: punpckhdq (%rsp), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -1583,711 +1589,702 @@ ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = 
xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 80(%rdx) +; SSE-NEXT: movaps %xmm15, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movapd %xmm7, 112(%r8) -; SSE-NEXT: movapd %xmm9, 96(%r8) -; SSE-NEXT: movapd %xmm10, 80(%r8) +; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: 
movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movapd %xmm8, 112(%r8) +; SSE-NEXT: movapd %xmm10, 96(%r8) +; SSE-NEXT: movapd %xmm11, 80(%r8) ; SSE-NEXT: movapd %xmm12, 64(%r8) -; SSE-NEXT: movapd %xmm13, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, (%r8) +; SSE-NEXT: movapd %xmm14, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movapd %xmm0, 112(%r9) -; SSE-NEXT: movapd %xmm1, 96(%r9) -; SSE-NEXT: movapd %xmm2, 80(%r9) -; SSE-NEXT: movapd %xmm3, 64(%r9) -; SSE-NEXT: movapd %xmm4, 48(%r9) -; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm6, 16(%r9) -; SSE-NEXT: movapd %xmm8, (%r9) +; SSE-NEXT: movapd %xmm2, 96(%r9) +; SSE-NEXT: movapd %xmm3, 80(%r9) +; SSE-NEXT: movapd %xmm4, 64(%r9) +; SSE-NEXT: movapd %xmm5, 48(%r9) +; SSE-NEXT: movapd %xmm6, 32(%r9) +; SSE-NEXT: movapd %xmm7, 16(%r9) +; SSE-NEXT: movapd %xmm9, (%r9) ; SSE-NEXT: addq $904, %rsp # imm = 0x388 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $952, %rsp # imm = 0x3B8 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] 
-; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm8[2,0],ymm13[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,0],ymm8[2,0],ymm7[7,4],ymm8[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; 
AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[0,0],ymm5[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm11[0,0],ymm4[5,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm14[2,0],ymm12[7,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[2,1],ymm0[6,4],ymm14[6,5] -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm5[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm2[0,0],ymm7[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm6 = xmm6[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm3[1,0],ymm5[6,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm0[0,0],ymm6[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm15[2,0],ymm0[7,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm15[2,1],ymm1[6,4],ymm15[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm9[2],xmm5[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm5 = xmm5[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: 
vmovaps 608(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[1,0],ymm1[0,0],ymm9[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm3[0,0],ymm1[3,0],ymm3[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,2],ymm14[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm2[1,0],ymm7[6,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[2,0],ymm11[1,0],ymm4[6,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm11[3,0],ymm2[4,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm11[2,2],ymm15[6,4],ymm11[6,6] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,0],ymm1[3,0],ymm13[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[2,2],ymm14[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = 
ymm7[2,0],ymm2[1,0],ymm7[6,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm10[2,2],ymm12[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[2,0],ymm8[1,0],ymm6[6,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[0,0],ymm15[3,0],ymm5[4,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm15[2,2],ymm11[6,4],ymm15[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[2,0],ymm7[1,0],ymm9[6,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, 
%ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm3[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 
(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm1[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: addq $952, %rsp # imm = 0x3B8 +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX1-ONLY-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-ONLY-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm11 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,2,7,0,5,2,7,0] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm5[2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm9[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm14[0,1],mem[2,3],ymm14[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,7,4,u> -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm8 -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = 
ymm11[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm9, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm11[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm11[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm9, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm9, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm1[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,1,6,u> -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm12[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded 
Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,1,6,u> +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm7[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm12[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm9, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm10[0,1],ymm7[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm5, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: 
vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-ONLY-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2506,234 +2503,234 @@ ; SSE-LABEL: load_i32_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1928, %rsp # imm = 0x788 -; SSE-NEXT: movdqa 768(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 752(%rdi), %xmm4 +; SSE-NEXT: movdqa 688(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: movdqa 672(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm10 +; SSE-NEXT: movdqa 640(%rdi), 
%xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm9 +; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 368(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1040(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm0 +; SSE-NEXT: movdqa 960(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1088(%rdi), %xmm2 +; SSE-NEXT: movdqa 1008(%rdi), %xmm2 ; SSE-NEXT: 
movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa 96(%rdi), %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm1 +; SSE-NEXT: movdqa 720(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 688(%rdi), %xmm2 +; SSE-NEXT: movdqa 768(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm1 +; SSE-NEXT: movdqa 1040(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa 
1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm2 +; SSE-NEXT: movdqa 1088(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 992(%rdi), %xmm0 +; SSE-NEXT: movdqa 1072(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm2 +; SSE-NEXT: movdqa 800(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm0 +; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: movdqa 848(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm0 +; SSE-NEXT: movdqa 832(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1200(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 1216(%rdi), %xmm0 +; SSE-NEXT: movdqa 1120(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1248(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm0 +; SSE-NEXT: movdqa 1168(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1152(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa 240(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm9 -; SSE-NEXT: movdqa 496(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: movdqa 880(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 816(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 896(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm2 +; SSE-NEXT: movdqa 928(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: movdqa 912(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1136(%rdi), %xmm0 +; SSE-NEXT: movdqa 1200(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1168(%rdi), %xmm2 +; SSE-NEXT: movdqa 1248(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1152(%rdi), %xmm0 +; SSE-NEXT: movdqa 1232(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -2744,43 +2741,41 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2789,7 +2784,7 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2797,12 +2792,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 
= xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -2812,23 +2807,26 @@ ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 544(%rdi), %xmm1 +; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2837,7 +2835,7 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 784(%rdi), %xmm1 +; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -2845,12 +2843,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 704(%rdi), %xmm1 +; SSE-NEXT: movdqa 784(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -2863,7 +2861,7 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 944(%rdi), %xmm1 +; SSE-NEXT: movdqa 864(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -2873,22 +2871,34 @@ ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa 944(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 1024(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 1104(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm1[0,0,1,1] @@ -2897,12 +2907,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1024(%rdi), %xmm1 +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -2910,7 +2920,7 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2923,24 +2933,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -2949,10 +2947,10 @@ ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2961,36 +2959,35 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -2999,13 +2996,13 @@ ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3019,8 +3016,8 @@ ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3036,7 +3033,8 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] @@ -3047,18 +3045,9 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3073,190 
+3062,201 @@ ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: 
movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, (%rsp) # 16-byte Spill 
+; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded 
Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] @@ -3277,63 +3277,63 @@ ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; 
SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3341,16 +3341,15 @@ ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -3377,14 +3376,6 @@ ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 176(%rsi) @@ -3393,13 +3384,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 192(%rsi) +; SSE-NEXT: movaps %xmm15, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 128(%rsi) +; SSE-NEXT: movaps %xmm15, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps %xmm15, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -3409,38 +3400,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 224(%rdx) +; SSE-NEXT: movaps %xmm15, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 192(%rdx) +; SSE-NEXT: movaps %xmm15, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 160(%rdx) +; SSE-NEXT: movaps %xmm15, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 128(%rdx) +; SSE-NEXT: movaps %xmm15, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 
16-byte Reload ; SSE-NEXT: movaps %xmm15, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rdx) +; SSE-NEXT: movaps %xmm15, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rdx) +; SSE-NEXT: movaps %xmm15, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rdx) +; SSE-NEXT: movaps %xmm15, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps %xmm15, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 224(%rcx) @@ -3491,7 +3490,7 @@ ; SSE-NEXT: movaps %xmm13, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 96(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 64(%r8) @@ -3519,316 +3518,297 @@ ; SSE-NEXT: movapd %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: addq $1928, %rsp # imm = 0x788 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2488, %rsp # imm = 0x9B8 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: subq $2456, %rsp # imm = 0x998 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; 
AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm11 +; 
AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX1-ONLY-NEXT: 
vinsertf128 $1, 896(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm9[1,3],ymm0[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm2[2,3],ymm14[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3836,18 +3816,20 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[2,0],ymm0[7,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,1],ymm0[6,4],ymm4[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] @@ -3862,10 +3844,10 @@ ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3881,100 +3863,92 @@ ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm6[0,0],ymm1[5,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm11[2,0],ymm0[7,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,1],ymm0[6,4],ymm11[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm0[0,0],ymm4[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm4[0,0],ymm5[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm0[0,0],ymm12[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm7[0,0],ymm6[5,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm7[2,1],ymm2[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm15[2,0],ymm0[7,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm15[2,1],ymm2[6,4],ymm15[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm0[0,0],ymm13[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm13[0,0],ymm12[5,4],ymm13[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[2,1],ymm3[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm9[2,0],ymm0[7,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm9[2,1],ymm3[6,4],ymm9[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3],ymm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,0],ymm1[0,0],ymm11[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1],mem[2],xmm15[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm15 = xmm15[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3982,9 +3956,8 @@ ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3993,786 +3966,789 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, 
%xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm4[1,0],ymm5[6,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm1[3,0],ymm15[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm6[1,0],ymm1[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm3[1,0],ymm11[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm7[1,0],ymm6[6,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm3[3,0],ymm2[4,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm3[2,2],ymm15[6,4],ymm3[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm7[2,0],mem[1,0],ymm7[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm4[3,0],ymm3[4,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm4[2,2],ymm15[6,4],ymm4[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm13[1,0],ymm12[6,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = 
ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0],ymm3[3,0],ymm8[4,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm3[2,2],ymm15[6,4],ymm3[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm12[1,0],ymm11[6,4],ymm12[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm8[3,0],ymm1[4,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm9[3,0],ymm3[4,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm2[1,0],ymm4[6,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm10[1,0],ymm9[6,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],mem[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1,2,3,4],mem[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5,6],mem[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3,4],mem[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $2488, %rsp # imm = 0x9B8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r9) +; AVX1-ONLY-NEXT: addq $2456, %rsp # imm = 0x998 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqu %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm1 
+; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm1 +; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm4[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm11[0,1],mem[2,3],ymm11[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <1,6,3,u> +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm7[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload ; 
AVX2-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 784(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 1104(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm15[2,3],ymm4[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm13[0,1],mem[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1],mem[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 
# 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 784(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 1104(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,7,4,u> +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm9 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 1056(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, 
%ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm10[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm14 -; 
AVX2-ONLY-NEXT: vpermd %ymm4, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5,6],ymm15[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vinserti128 $1, 1056(%rdi), %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm8[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3],ymm4[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm7[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm7[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte 
Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm11[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, (%rsp), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm4[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm5[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm8[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm11[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm11[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm9[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm9[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm8, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm12[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm7, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3],ymm12[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -4782,13 +4758,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; 
AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -4797,29 +4773,38 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -4827,203 +4812,202 @@ ; AVX512F-LABEL: load_i32_stride5_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm26 ; AVX512F-NEXT: 
vmovdqa64 448(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm19, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm15, %zmm1, %zmm19 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm15, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; 
AVX512F-NEXT: vpermt2d %zmm20, %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm14, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm15, %zmm1, %zmm14 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm28, %zmm27 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm28, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm26 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm15, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> -; AVX512F-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm31, %zmm29 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm13 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm16, %zmm21 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512F-NEXT: 
vpermt2d %zmm11, %zmm25, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm12, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm25, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm31, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm25 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 -; 
AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm8, %zmm21 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm8, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm12 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm6, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm27 +; AVX512F-NEXT: 
vpermt2d %zmm19, %zmm6, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm28 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512F-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512F-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5031,203 +5015,202 @@ ; AVX512BW-LABEL: load_i32_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), 
%zmm11 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm19, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm17 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm21, 
%zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm14, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm14, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm1, %zmm14 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm28, %zmm27 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm28, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm31, %zmm29 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 
+; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm25, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm17 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm25, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm31, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm25 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm21 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm12 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, 
%zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm28 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) ; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -138,31 +138,32 @@ ; AVX512-FAST-LABEL: load_i32_stride6_vf2: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 +; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-FAST-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512-FAST-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 +; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 +; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX512-FAST-NEXT: vmovd %xmm2, %r10d +; AVX512-FAST-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 ; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] -; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX512-FAST-NEXT: # xmm1 = mem[0,0] -; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm0, %xmm5 +; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm0 = [4,2,4,2] +; AVX512-FAST-NEXT: # xmm0 = mem[0,0] +; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] ; AVX512-FAST-NEXT: # xmm6 = mem[0,0] -; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX512-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX512-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512-FAST-NEXT: vmovq %xmm2, (%rcx) +; AVX512-FAST-NEXT: vmovq %xmm1, (%rcx) ; AVX512-FAST-NEXT: vmovq %xmm5, (%r8) -; AVX512-FAST-NEXT: vmovlps %xmm1, (%r9) -; AVX512-FAST-NEXT: vmovlps %xmm3, (%rax) +; AVX512-FAST-NEXT: vmovlps %xmm0, (%r9) +; AVX512-FAST-NEXT: vmovlps %xmm2, (%rax) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq ; @@ -928,11 +929,11 @@ ; SSE-NEXT: movdqa 192(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa 336(%rdi), %xmm14 +; SSE-NEXT: movdqa 144(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm5 -; SSE-NEXT: movdqa 288(%rdi), %xmm15 -; SSE-NEXT: movdqa 304(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rdi), %xmm12 ; SSE-NEXT: movdqa (%rdi), %xmm8 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 @@ -971,16 +972,16 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; 
SSE-NEXT: movdqa 288(%rdi), %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 336(%rdi), %xmm9 +; SSE-NEXT: movdqa 352(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1036,13 +1037,13 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm15, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm8 +; SSE-NEXT: movdqa 128(%rdi), %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1056,12 +1057,12 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm11 +; SSE-NEXT: movdqa 368(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa 320(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1173,43 +1174,43 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm15, 16(%r8) +; SSE-NEXT: movapd %xmm15, 48(%r8) ; SSE-NEXT: movapd %xmm12, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movapd %xmm2, 48(%r9) ; SSE-NEXT: movapd %xmm3, 32(%r9) -; SSE-NEXT: movapd %xmm4, 48(%r9) +; SSE-NEXT: movapd %xmm4, 16(%r9) ; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, 16(%rax) +; SSE-NEXT: movapd %xmm13, 48(%rax) ; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm8, 48(%rax) +; SSE-NEXT: movapd %xmm8, 16(%rax) ; SSE-NEXT: movapd %xmm10, (%rax) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq @@ -2131,366 +2132,369 @@ ; SSE-LABEL: load_i32_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1032, %rsp # imm = 0x408 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 144(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa 496(%rdi), %xmm4 +; SSE-NEXT: movdqa 448(%rdi), %xmm3 +; SSE-NEXT: movdqa 384(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 400(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm10 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm9 +; SSE-NEXT: movdqa 528(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm3 +; SSE-NEXT: movdqa 240(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 576(%rdi), %xmm5 +; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm3 +; SSE-NEXT: movdqa 624(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm7 -; SSE-NEXT: movdqa 592(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 672(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa 688(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 720(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 736(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; 
SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} 
xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa 224(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm0 +; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm0 +; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} 
xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2498,118 
+2502,117 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: 
movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, %xmm15 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; 
SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2617,9 +2620,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2631,16 +2635,16 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, %xmm9 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2648,13 +2652,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2663,7 +2664,8 @@ ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] 
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -2671,54 +2673,54 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, 96(%r8) @@ -2737,39 +2739,40 @@ ; SSE-NEXT: movapd %xmm2, 112(%r9) ; SSE-NEXT: movapd %xmm3, 96(%r9) ; SSE-NEXT: movapd %xmm4, 80(%r9) -; SSE-NEXT: movapd %xmm6, 64(%r9) -; SSE-NEXT: movapd %xmm8, 48(%r9) -; SSE-NEXT: movapd %xmm9, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movapd %xmm5, 64(%r9) +; SSE-NEXT: movapd %xmm7, 48(%r9) +; SSE-NEXT: movapd %xmm8, 32(%r9) +; SSE-NEXT: movapd %xmm10, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm14, 112(%rax) ; SSE-NEXT: movapd %xmm13, 96(%rax) -; SSE-NEXT: movapd %xmm15, 80(%rax) -; SSE-NEXT: movapd %xmm10, 64(%rax) +; SSE-NEXT: movapd %xmm6, 80(%rax) +; SSE-NEXT: movapd %xmm9, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) -; SSE-NEXT: movapd %xmm11, 16(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movapd %xmm15, (%rax) ; SSE-NEXT: addq $1032, %rsp # imm = 0x408 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: subq $1048, %rsp # imm = 0x418 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 @@ -2786,20 +2789,20 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm5[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm5[0],ymm11[3],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm6[0,0],ymm4[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm6[0,0],ymm2[6,4],ymm6[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 @@ -2823,9 +2826,9 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 @@ -2859,62 +2862,64 @@ ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3],ymm10[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm10[3,0],ymm8[1,0],ymm10[7,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[2,0],ymm8[2,3],ymm14[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[0,2],xmm7[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,1],ymm12[1,3],ymm11[7,5],ymm12[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[3,0],ymm6[1,0],ymm11[7,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm5[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm14[1,3],ymm0[7,5],ymm14[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm8[1,3],ymm0[7,5],ymm8[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[3,0],ymm3[1,0],ymm14[7,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,0],xmm2[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[1,0],xmm2[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm13[1,3],ymm0[7,5],ymm13[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm9[1,0],ymm11[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm9[1,0],ymm15[7,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1],ymm13[1,3],ymm10[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[2,1],mem[2,0],ymm10[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2928,47 +2933,45 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,0],ymm7[2,0],ymm8[4,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm6[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,0],ymm11[2,0],ymm12[4,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm13[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm8[2,0],ymm5[6,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[2,1],ymm9[2,0],ymm15[6,5],ymm9[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[2,0],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload @@ -2982,203 +2985,205 @@ ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm4[3,1],mem[3,3] ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm13[2,1],ymm11[7,5],ymm13[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,1],ymm5[2,1],ymm13[7,5],ymm5[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm2[3,1],ymm1[4,5],ymm2[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1],ymm7[3,1],ymm8[4,5],ymm7[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1],ymm11[3,1],ymm12[4,5],ymm11[7,5] ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm6[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[3,1],ymm6[2,1],ymm12[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm8[2,1],ymm5[7,5],ymm8[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm9[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[3,1],xmm14[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm14 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm2[3,1],ymm12[2,1],ymm2[7,5],ymm12[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm9[2,1],ymm2[7,5],ymm9[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,0],ymm13[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm7[0,0],ymm11[6,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm1[2,0],ymm7[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[1],ymm13[0],ymm10[2],ymm13[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,0],ymm11[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm2[0,0],ymm5[6,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm9[1],ymm12[0],ymm9[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2,0],ymm4[4,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm0[0,0],ymm3[6,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm1 = ymm1[0,3],ymm6[2,0],ymm1[4,7],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm6[0],ymm4[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,0],ymm12[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,0],ymm0[0,0],ymm2[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,2],ymm6[2,0],ymm0[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm7[1,0],ymm1[7,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,3],ymm1[2,0],ymm7[4,7],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm2[1,0],ymm5[7,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm8[1],ymm14[0],ymm8[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,1],ymm14[2,0],ymm6[4,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm1[0,0],ymm12[6,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2],ymm15[2,0],ymm1[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm7[1,0],ymm11[7,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm11[2,0],ymm7[4,7],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1],ymm14[2,0],ymm15[5,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm0[1,0],ymm2[7,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm5[1,0],ymm2[7,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,3],ymm2[2,0],ymm5[4,7],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1],ymm4[2,0],ymm5[5,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[3,0],ymm1[1,0],ymm12[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm3[2,0],ymm1[4,7],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = 
ymm6[1,1],ymm4[2,0],ymm6[5,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) +; AVX1-ONLY-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3188,8 +3193,8 @@ ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm3 @@ -3202,33 +3207,33 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm13 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm13, %ymm0 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm0 @@ -3236,140 +3241,139 @@ ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: 
vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps 
(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] @@ -3382,219 +3386,214 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = 
ymm15[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = 
ymm11[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm11 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm3 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 
32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm3 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: 
vmovaps %ymm11, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm13, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3603,10 +3602,10 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm3 @@ -3619,33 +3618,33 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm13 = <0,6,4,u> +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = 
ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm13, %ymm0 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm0 @@ -3653,79 +3652,79 @@ ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] ; 
AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] @@ -3738,10 +3737,10 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3751,21 +3750,20 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] @@ -3775,32 +3773,33 @@ ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; 
AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] @@ -3810,205 +3809,199 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm11[3,3,3,3] +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm14 ; AVX2-FAST-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm3[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm11 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm15 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm15 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm5 +; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm3 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm3 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm13, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4019,8 +4012,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm3 @@ -4033,33 +4026,33 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm12 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm13 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm13, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm0 @@ -4067,140 +4060,139 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 -; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm12, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[0,1],ymm0[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: 
vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] @@ -4213,219 +4205,214 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, 
%ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: 
vpermps %ymm10, %ymm13, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -4739,32 +4726,31 @@ ; SSE-LABEL: load_i32_stride6_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2184, %rsp # imm = 0x888 -; SSE-NEXT: movdqa 912(%rdi), %xmm7 +; SSE-NEXT: movdqa 816(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 928(%rdi), %xmm3 +; SSE-NEXT: movdqa 832(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 864(%rdi), %xmm8 +; SSE-NEXT: movdqa 768(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm4 +; SSE-NEXT: movdqa 784(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm9 +; SSE-NEXT: movdqa 432(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm5 +; SSE-NEXT: movdqa 448(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm10 +; SSE-NEXT: movdqa 384(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm6 +; SSE-NEXT: movdqa 400(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] @@ -4787,590 +4773,590 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1248(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1264(%rdi), %xmm0 +; SSE-NEXT: movdqa 1152(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1296(%rdi), %xmm3 +; SSE-NEXT: movdqa 1200(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1312(%rdi), %xmm0 +; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm0 +; SSE-NEXT: movdqa 528(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 864(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm0 +; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: movdqa 912(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 928(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1152(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1248(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm0 +; SSE-NEXT: movdqa 1264(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1216(%rdi), %xmm0 +; SSE-NEXT: movdqa 1296(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1312(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = 
xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm7 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm13 +; SSE-NEXT: movdqa 256(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm3 +; SSE-NEXT: movdqa 624(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa 640(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm0 +; SSE-NEXT: movdqa 1008(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 
1024(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1440(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1456(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1504(%rdi), %xmm0 +; SSE-NEXT: movdqa 1392(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1408(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa 208(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 336(%rdi), %xmm6 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm10 -; SSE-NEXT: movdqa 592(%rdi), %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm10 +; SSE-NEXT: movdqa 688(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa 640(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 720(%rdi), %xmm11 +; SSE-NEXT: movdqa 736(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1072(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 1104(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa 1120(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 1392(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1408(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; SSE-NEXT: 
pshufd {{.*#+}} xmm12 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1440(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1456(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 1488(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1504(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] +; 
SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: 
# xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm8[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa 272(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 
368(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: 
movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm2 +; SSE-NEXT: movdqa 752(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 848(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa 800(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 848(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa 800(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; 
SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa 944(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa 896(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 944(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 896(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa 992(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa 992(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa 1136(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa 1136(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} 
xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1088(%rdi), %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa 1232(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa 1184(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 1184(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1328(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1280(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa 1328(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1280(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1424(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] 
-; SSE-NEXT: movdqa 1376(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1376(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1520(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1472(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa 1520(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1472(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] @@ -5380,7 +5366,17 @@ ; SSE-NEXT: punpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -5408,8 +5404,7 @@ ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -5425,81 +5420,69 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; 
SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 
; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5509,8 +5492,8 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5520,8 +5503,8 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5577,6 +5560,17 @@ ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,0,1,1] @@ -5608,7 +5602,8 @@ ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 
= xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5623,43 +5618,32 @@ ; SSE-NEXT: # xmm11 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] @@ -5667,16 +5651,16 @@ ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -5706,7 +5690,7 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm6 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5714,10 +5698,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5725,10 +5709,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: 
movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5736,10 +5720,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5747,10 +5731,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5758,10 +5742,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5769,21 +5753,20 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5791,10 +5774,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5802,13 +5785,14 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5829,7 +5813,8 @@ ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5838,17 +5823,16 @@ ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5856,14 +5840,6 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -5872,13 +5848,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5888,38 
+5864,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -5961,7 +5945,7 @@ ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r8) @@ -5979,18 +5963,19 @@ ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movapd %xmm2, 
240(%r9) ; SSE-NEXT: movapd %xmm3, 224(%r9) ; SSE-NEXT: movapd %xmm5, 208(%r9) -; SSE-NEXT: movapd %xmm7, 192(%r9) -; SSE-NEXT: movapd %xmm13, 176(%r9) -; SSE-NEXT: movapd %xmm8, 160(%r9) -; SSE-NEXT: movapd %xmm9, 144(%r9) -; SSE-NEXT: movapd %xmm11, 128(%r9) +; SSE-NEXT: movapd %xmm8, 192(%r9) +; SSE-NEXT: movapd %xmm10, 176(%r9) +; SSE-NEXT: movapd %xmm13, 160(%r9) +; SSE-NEXT: movapd %xmm11, 144(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6033,42 +6018,42 @@ ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movapd %xmm6, 16(%rax) ; SSE-NEXT: movapd %xmm4, (%rax) ; SSE-NEXT: addq $2184, %rsp # imm = 0x888 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2584, %rsp # imm = 0xA18 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $2600, %rsp # imm = 0xA28 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6076,11 +6061,11 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6089,9 +6074,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6099,17 +6084,17 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6117,9 +6102,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm13[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %ymm1 +; AVX1-ONLY-NEXT: 
vmovapd 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6127,16 +6112,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm11[0,0],ymm1[6,4],ymm11[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[2,2],ymm0[6,4],ymm11[6,6] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6144,9 +6129,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6154,16 +6139,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6171,9 +6156,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6181,26 +6166,26 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6208,16 +6193,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 
864(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm3[0,0],ymm1[6,4],ymm3[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,2],ymm0[6,4],ymm3[6,6] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6225,9 +6210,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6235,52 +6220,51 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm1[0,0],ymm5[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,2],xmm5[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm12 -; 
AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm12[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm15[0],ymm0[3],ymm15[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm15[1,0],ymm0[7,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[2,3],ymm14[6,4],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm9[1,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm1[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm12[1,3],ymm0[7,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm15[1,0],ymm0[7,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[2,3],ymm14[6,4],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,0],xmm1[3,0] @@ -6326,16 +6310,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[1,0],xmm7[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,2],xmm7[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm10[1,3],ymm0[7,5],ymm10[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm6[1,0],ymm8[7,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,0],xmm4[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm4[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] @@ -6358,17 +6342,15 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm0[1,0],ymm3[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm5[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm9[1,0],ymm0[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,0],xmm5[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6384,19 +6366,22 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # 
ymm12 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm12[2,0],ymm1[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm9[2,1],mem[2,0],ymm9[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6414,7 +6399,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] @@ -6442,180 +6427,182 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[2,0],ymm11[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm5[2,0],ymm1[4,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,1],ymm13[2,0],ymm8[6,5],ymm13[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[0,0],ymm3[2,0],ymm1[4,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,1],ymm11[2,0],ymm6[6,5],ymm11[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # 
ymm10 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm12[3,1],ymm0[4,5],ymm12[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[2,0],ymm0[6,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm4[2,0],ymm1[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm7[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[3,1],ymm7[2,1],ymm8[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm5[3,1],ymm0[4,5],ymm5[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm9[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1],ymm8[2,1],ymm14[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,1],mem[2,1],ymm0[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,1],ymm15[2,1],ymm9[7,5],ymm15[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm3[3,1],ymm0[4,5],ymm3[7,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = 
xmm5[3,1],xmm7[3,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,1],ymm13[2,1],ymm9[7,5],ymm13[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm13[2,1],ymm5[7,5],ymm13[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm2[3,1],ymm0[4,5],ymm2[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm2 = xmm12[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[3,1],ymm11[2,1],ymm12[7,5],ymm11[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,1],mem[2,1],ymm3[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm4[3,1],ymm0[4,5],ymm4[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm10[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm6[2,1],ymm2[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6625,11 +6612,12 @@ ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm10[0,0],ymm2[6,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm10[0,0],ymm3[6,4],ymm10[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6645,9 +6633,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,0],ymm14[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4] @@ -6665,9 +6652,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] @@ -6675,8 +6663,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6687,11 +6676,10 @@ ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,0],ymm9[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm4[0,0],ymm12[6,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm4[0,0],ymm2[6,4],ymm4[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm1[2,0],ymm4[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6706,8 +6694,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm13[0],ymm1[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4] @@ -6727,8 +6716,7 @@ ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,0],ymm12[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm2[0,0],ymm14[6,4],ymm2[4,4] @@ -6748,7 +6736,8 @@ ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4] @@ -6768,8 +6757,8 @@ ; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm9[1],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm15 = ymm9[0,1],ymm15[2,0],ymm9[4,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1],ymm15[2,0],ymm12[4,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4] @@ -6810,7 +6799,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -6818,7 +6807,8 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm10[3,0],ymm4[1,0],ymm10[7,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload @@ -6884,14 +6874,6 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) @@ -6900,13 +6882,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps 
%ymm6, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -6916,13 +6898,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -6932,22 +6914,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) @@ -6972,204 +6962,204 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX1-ONLY-NEXT: addq $2584, %rsp # imm = 0xA18 +; AVX1-ONLY-NEXT: addq $2600, %rsp # imm = 0xA28 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2504, %rsp # imm = 0x9C8 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-SLOW-NEXT: subq 
$2440, %rsp # imm = 0x988 +; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm9 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm7 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[0,1],ymm6[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-SLOW-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-SLOW-NEXT: 
vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; 
AVX2-SLOW-NEXT: vpermps %ymm3, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm9[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 1504(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm7 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm7, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] @@ -7177,53 +7167,53 @@ ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovups 
%ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm4 -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -7234,15 +7224,15 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -7257,20 +7247,21 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = 
ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] @@ -7283,148 +7274,179 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6,7] +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} 
ymm3 = ymm3[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2,3],ymm2[4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,3,3,3] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded 
Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,3,3,3] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] @@ -7443,92 +7465,60 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7538,226 +7528,219 @@ ; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; 
AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm3[2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups 
%ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7] +; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rsi) @@ -7766,13 +7749,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7782,13 +7765,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7798,22 +7781,30 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps 
%ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) @@ -7827,12 +7818,11 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7842,285 +7832,267 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-SLOW-NEXT: addq $2440, %rsp # imm = 0x988 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-FAST-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm9 = <0,6,4,u> +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vmovaps 1376(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 832(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 
1184(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm14[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps 1504(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm4, %ymm8 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm4, %ymm5 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -8131,422 +8103,445 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3] +; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = 
xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1,2,3],ymm7[4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; 
AVX2-FAST-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2,3],ymm1[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 
= ymm1[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3],ymm7[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 
= ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm2 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm5 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 
= ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -8586,10 +8581,10 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] @@ -8607,21 +8602,14 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%rsi) @@ -8630,13 +8618,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8646,13 +8634,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8662,22 +8650,30 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r8) ; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%r9) @@ -8691,13 +8687,12 @@ ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm9, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm13, 160(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) @@ -8706,204 +8701,204 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-FAST-NEXT: addq $2472, %rsp # imm = 0x9A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2504, %rsp # imm = 0x9C8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: subq $2440, %rsp # imm = 0x988 +; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: 
vmovaps 288(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm9 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm7 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[0,1],ymm6[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm9[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1504(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm5, %ymm0 +; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm7 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7] @@ -8911,53 +8906,53 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm7, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -8968,15 +8963,15 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -8991,20 +8986,21 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] @@ -9017,148 +9013,179 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd 
{{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = 
mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2,3],ymm2[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] @@ -9177,92 +9204,60 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = 
mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9272,226 +9267,219 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm3[2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = 
ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rsi) @@ -9500,13 +9488,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps 
%ymm1, 128(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9516,13 +9504,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9532,22 +9520,30 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 192(%r9) @@ -9561,12 +9557,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9576,707 +9571,701 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-FAST-PERLANE-NEXT: addq $2440, %rsp # imm = 0x988 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride6_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm21 +; AVX512F-NEXT: subq $2568, %rsp # imm = 0xA08 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, 
%zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 
%zmm20, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm17, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d 
%zmm1, %zmm17, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; 
AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm17, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm17, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm17, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, 
%zmm20 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <1,7,13,19,25,31,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm4, %zmm5, %zmm21 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <2,8,14,20,26,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm5, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm11, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm14, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm11, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm14, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 
1152(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512F-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; 
AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: movw $31, %ax ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm26 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm31 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: movb 
$-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm12, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rcx) 
+; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: addq $2568, %rsp # imm = 0xA08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm21 +; AVX512BW-NEXT: subq $2568, %rsp # imm = 0xA08 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm17, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm17, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] -; 
AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm25, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm31, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm17, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm17, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <1,7,13,19,25,31,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> +; 
AVX512BW-NEXT: vpermt2d %zmm4, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <2,8,14,20,26,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm23 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm6 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm11, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm11, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm12, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; 
AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm18 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, 
%zmm5 {%k1} ; AVX512BW-NEXT: movw $31, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm26 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx) +; 
AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: addq $2568, %rsp # imm = 0xA08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <384 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -171,12 +171,10 @@ ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [13,4,13,4,13,4,13,4] ; AVX512-FAST-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 ; AVX512-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [6,13,6,13,6,13,6,13] ; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 ; AVX512-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi) @@ -272,53 +270,54 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = 
xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm6[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[0,1],xmm5[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r10) +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r10) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -340,19 +339,19 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm9 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] @@ -360,21 +359,21 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-SLOW-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-SLOW-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-SLOW-NEXT: vmovaps %xmm4, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %xmm7, (%r8) +; AVX2-SLOW-NEXT: vmovaps %xmm6, (%r8) ; AVX2-SLOW-NEXT: vmovaps %xmm9, (%r9) ; AVX2-SLOW-NEXT: vmovaps %xmm5, (%r10) ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rax) @@ -392,26 +391,26 @@ ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] -; 
AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm9 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] @@ -419,21 +418,21 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-FAST-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-FAST-NEXT: vmovaps %xmm5, (%rcx) -; AVX2-FAST-NEXT: vmovaps %xmm7, (%r8) +; AVX2-FAST-NEXT: vmovaps %xmm6, (%r8) ; AVX2-FAST-NEXT: vmovaps %xmm9, (%r9) ; AVX2-FAST-NEXT: vmovaps %xmm4, (%r10) ; AVX2-FAST-NEXT: vmovaps %xmm0, (%rax) @@ -457,19 +456,19 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} 
xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] @@ -477,21 +476,21 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rax) @@ -549,101 +548,101 @@ ; SSE-LABEL: load_i32_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 192(%rdi), %xmm8 -; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm15 -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), 
%xmm12 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 208(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] ; SSE-NEXT: movdqa %xmm14, %xmm15 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm12 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm12[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm12[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] @@ -652,7 +651,7 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -668,25 +667,25 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm8, (%rcx) -; SSE-NEXT: movapd %xmm13, 16(%rcx) -; SSE-NEXT: movapd %xmm15, (%r8) -; SSE-NEXT: movapd %xmm7, 16(%r8) -; SSE-NEXT: movapd %xmm2, (%r9) -; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm8, 16(%rcx) +; SSE-NEXT: movapd %xmm13, (%rcx) +; SSE-NEXT: movapd %xmm15, 16(%r8) +; SSE-NEXT: movapd %xmm7, (%r8) +; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movapd %xmm11, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, (%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm14, 16(%rax) +; SSE-NEXT: movapd %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm3, (%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm3, 16(%rax) +; SSE-NEXT: movapd %xmm4, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; @@ -694,95 +693,96 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm2[0],ymm11[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[2,2],ymm12[5,5],ymm11[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm9[0],xmm13[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm4[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = 
ymm8[0,0],ymm14[3,3],ymm8[4,4],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm9[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm11[0,3],ymm2[7,5],ymm11[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,0],ymm12[0,0],ymm11[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm11[0,2],ymm12[7,5],ymm11[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,2],ymm12[2,0],ymm8[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[1,0],ymm4[2,0],ymm14[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm12[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,1],ymm10[2,2],ymm11[5,5],ymm10[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,0],ymm9[3,3],ymm8[4,4],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm10[0,3],ymm2[7,5],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,1],ymm15[2,0],ymm11[6,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm11[0,0],ymm10[5,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1],ymm10[0,2],ymm11[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,0],ymm4[1,1],ymm12[4,4],ymm4[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,0],ymm4[2,0],ymm12[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm12[2,0],ymm8[6,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm11[2,0],ymm8[6,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[2,1],ymm4[3,3],ymm14[6,5],ymm4[7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[0,1],xmm9[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,0],ymm9[0,0],ymm14[7,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[2,0],ymm4[3,1],ymm12[6,4],ymm4[7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,1],xmm13[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm13[0,0],ymm12[7,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,0],ymm5[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,0],ymm5[4,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) @@ -1183,21 +1183,21 @@ ; SSE-LABEL: load_i32_stride7_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa 272(%rdi), %xmm5 -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: movdqa 240(%rdi), %xmm6 +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa 384(%rdi), %xmm5 +; SSE-NEXT: movdqa 336(%rdi), %xmm15 +; SSE-NEXT: movdqa 352(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm9 -; SSE-NEXT: movdqa 192(%rdi), %xmm14 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1207,12 +1207,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: movdqa 
%xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1225,14 +1225,14 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm8 -; SSE-NEXT: movdqa 384(%rdi), %xmm5 +; SSE-NEXT: movdqa 304(%rdi), %xmm8 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1245,7 +1245,7 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa 144(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -1257,7 +1257,7 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 144(%rdi), %xmm7 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -1269,7 +1269,7 @@ ; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa 368(%rdi), %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -1280,14 +1280,14 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] @@ -1297,7 +1297,7 @@ ; SSE-NEXT: movdqa %xmm10, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm9 +; SSE-NEXT: movdqa 176(%rdi), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] @@ -1307,7 +1307,7 @@ ; SSE-NEXT: movdqa %xmm15, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: movdqa 400(%rdi), %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] @@ -1318,13 +1318,13 @@ ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm13 +; SSE-NEXT: movdqa 288(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,0,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm0 @@ -1334,7 +1334,7 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm9, %xmm3 @@ -1346,7 +1346,7 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm15, %xmm0 @@ -1357,7 +1357,7 @@ ; SSE-NEXT: movdqa %xmm12, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm13, %xmm4 @@ -1467,292 +1467,294 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm9, 48(%r9) -; SSE-NEXT: movapd %xmm14, 32(%r9) -; SSE-NEXT: movapd %xmm12, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, 48(%rax) -; SSE-NEXT: movapd %xmm15, 32(%rax) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm9, 32(%r9) +; SSE-NEXT: movapd %xmm14, 48(%r9) +; SSE-NEXT: movapd %xmm12, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movapd %xmm13, 32(%rax) +; SSE-NEXT: movapd %xmm15, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, 48(%rax) -; SSE-NEXT: movapd %xmm6, 32(%rax) -; SSE-NEXT: movapd %xmm7, (%rax) -; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movapd %xmm4, 32(%rax) +; SSE-NEXT: movapd %xmm6, 48(%rax) +; SSE-NEXT: movapd %xmm7, 16(%rax) +; SSE-NEXT: movapd %xmm10, (%rax) ; SSE-NEXT: addq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), 
%xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm5[2,2],ymm7[5,5],ymm5[6,6] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm7[2,2],ymm5[5,5],ymm7[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm6[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm15[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm3[2,2],ymm10[5,5],ymm3[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm2[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,0],ymm14[3,3],ymm8[4,4],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} 
xmm14 = zero,xmm14[1,2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm9[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,1],ymm6[1,3],ymm2[4,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[3,1],ymm7[0,3],ymm11[7,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[2,1],ymm13[2,0],ymm5[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm6[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[2,1],ymm11[2,0],ymm10[6,5],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm5[0,0],ymm7[5,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm7[0,2],ymm5[7,5],ymm7[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,0],ymm15[1,1],ymm14[4,4],ymm15[5,5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm7[2,0],ymm1[4,6],ymm7[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm10[0,0],ymm3[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,1],ymm3[0,2],ymm10[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1],ymm4[1,3],ymm5[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[0,2],ymm8[2,0],ymm0[4,6],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm6[2,0],ymm2[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm3[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,0],ymm0[6,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,1],ymm6[3,3],ymm2[6,5],ymm6[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,0],ymm2[1,1],ymm5[4,4],ymm2[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,2],ymm3[2,0],ymm8[4,6],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,0],ymm1[0,0],ymm8[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm2[2,0],ymm5[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[2,0],ymm1[6,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,1],ymm5[6,4],ymm2[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[0,0],ymm13[1,0],ymm14[4,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1],xmm3[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,1],ymm4[3,3],ymm5[6,5],ymm4[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm3[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm0[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[2,0],ymm2[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm8[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm12[1,0],ymm11[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm6[0,0],ymm2[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,0],ymm6[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm6[0,0],ymm5[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,0],ymm4[1,0],ymm11[4,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm15[3,1],ymm14[6,4],ymm15[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0],xmm15[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,0],ymm12[2,0],ymm11[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm1[2,0],ymm6[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0],ymm10[1,0],ymm9[4,4],ymm10[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,0],ymm1[0,0],ymm5[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, (%rsp), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[3,0],ymm5[0,0],ymm14[7,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,3,2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,0],ymm10[2,0],ymm9[5,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1986,6 +1988,7 @@ ; AVX2-FAST-LABEL: load_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 @@ -1993,7 +1996,6 @@ ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -2690,421 +2692,437 @@ ; SSE-LABEL: load_i32_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1160, %rsp # imm = 0x488 -; SSE-NEXT: movdqa 80(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm3 -; SSE-NEXT: movdqa 608(%rdi), %xmm4 -; SSE-NEXT: movdqa 560(%rdi), %xmm10 -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm4 +; SSE-NEXT: movdqa 496(%rdi), %xmm12 +; SSE-NEXT: movdqa 448(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 464(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm7 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: 
movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm9 -; SSE-NEXT: movdqa 496(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 640(%rdi), %xmm5 +; SSE-NEXT: movdqa 608(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 688(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm8 +; SSE-NEXT: movdqa 720(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm6 +; SSE-NEXT: movdqa 384(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 784(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 800(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 832(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm3 -; SSE-NEXT: movdqa 832(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa 704(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = 
xmm8[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; 
SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 704(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; 
SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm14 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3112,51 +3130,51 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm15[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -3164,58 +3182,46 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; 
SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3225,8 +3231,9 @@ ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -3234,7 +3241,7 @@ ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3244,7 +3251,7 @@ ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3254,8 +3261,9 @@ ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -3263,83 +3271,83 @@ ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) @@ -3349,7 +3357,7 @@ ; SSE-NEXT: movaps %xmm1, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) @@ -3372,9 +3380,9 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movapd %xmm14, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rax) -; SSE-NEXT: movapd %xmm15, 96(%rax) +; SSE-NEXT: movaps %xmm1, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3401,593 +3409,587 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX1-ONLY-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = 
xmm5[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[1] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] 
+; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm3[2,2],ymm7[5,5],ymm3[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm10[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm3[3,3],ymm9[4,4],ymm3[7,7] +; 
AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1],ymm3[2,2],ymm12[5,5],ymm3[6,6] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm2[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 
160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm15[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0],ymm1[3,3],ymm5[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm8[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm2[3,3],ymm12[4,4],ymm2[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm4[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm13[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[3,3],ymm2[4,4],ymm5[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm11[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[2,2],ymm3[5,5],ymm8[6,6] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm6[0],xmm2[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3],ymm5[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0],ymm13[3,3],ymm15[4,4],ymm13[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm4[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[0,3],ymm13[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[2,1],ymm13[2,0],ymm11[6,5],ymm13[6,4] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm10[0,3],ymm13[7,5],ymm10[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[2,1],ymm13[2,0],ymm7[6,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm8[0,3],ymm13[7,5],ymm8[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm13[2,0],ymm9[6,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm14[0,3],ymm8[7,5],ymm14[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1],ymm8[2,0],ymm4[6,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = 
ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,1],ymm0[0,2],ymm11[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm9[1,3],ymm1[4,5],ymm9[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[0,2],ymm8[2,0],ymm3[4,6],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm8[1,3],ymm1[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,2],ymm10[2,0],ymm12[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm4[0,0],ymm14[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,1],ymm13[1,3],ymm7[4,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[0,2],ymm10[2,0],ymm6[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,1],ymm8[0,3],ymm1[7,5],ymm8[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,1],ymm7[1,3],ymm5[4,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,2],ymm6[2,0],ymm15[4,6],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm8[2,0],ymm1[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,1],ymm4[2,0],ymm2[6,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,0],ymm9[2,0],ymm3[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[1,0],ymm11[2,0],ymm14[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,0],ymm10[2,0],ymm2[6,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,0],ymm12[0,0],ymm15[7,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[1,0],ymm7[2,0],ymm5[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,0],ymm12[2,0],ymm1[6,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,1],ymm9[3,3],ymm3[6,5],ymm9[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[1,0],ymm12[2,0],ymm15[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[3,2] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,1],ymm14[0,3],ymm1[7,5],ymm14[4,7] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[2,1],ymm4[2,0],ymm12[6,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,1],ymm8[3,3],ymm3[6,5],ymm8[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm12[1,0],ymm9[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[3,1],ymm4[0,3],ymm3[7,5],ymm4[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,1],ymm10[2,0],ymm3[6,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm15[0],ymm5[2],ymm15[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm7[0,3],ymm15[7,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,1],ymm15[2,0],ymm5[6,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[3,3],ymm0[6,5],ymm11[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0],xmm1[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,0],ymm0[2,0],ymm15[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[0,0],mem[1,0],ymm1[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,0],ymm0[2,0],ymm2[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm2[0,0],ymm8[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[3,1],ymm10[0,2],ymm2[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = 
ymm5[0,0],ymm13[1,0],ymm5[4,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm12[2,0],ymm9[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm13[2,0],ymm5[5,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm13[1,1],ymm2[4,4],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,2],ymm15[2,0],ymm8[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,0],ymm6[0,0],ymm11[5,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[3,1],ymm10[0,2],ymm6[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,0],ymm1[1,1],ymm0[4,4],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,2],ymm15[2,0],ymm3[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,0],ymm0[0,0],ymm4[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1],ymm6[0,2],ymm0[7,5],ymm6[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,0],ymm15[1,1],ymm11[4,4],ymm15[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[0,2],ymm6[2,0],ymm10[4,6],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm5[0,0],ymm7[5,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,1],ymm4[0,2],ymm5[7,5],ymm4[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm12[1,1],ymm1[4,4],ymm12[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,0],ymm0[0,0],ymm8[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,0],ymm13[2,0],ymm2[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,0],ymm4[2,0],ymm8[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,0],ymm0[0,0],ymm10[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,0],ymm15[2,0],ymm11[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm12[2,0],ymm7[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm2[0,1,0,1] +; 
AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm4[3,1],ymm8[6,4],ymm4[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,0],ymm6[2,0],ymm12[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[2,0],ymm15[3,1],ymm11[6,4],ymm15[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0],xmm0[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm4[0,0],ymm6[1,0],ymm4[4,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} xmm5 = xmm12[0,1],xmm5[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[2,0],ymm13[3,1],ymm7[6,4],ymm13[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0],xmm12[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,0],ymm15[1,0],ymm14[4,4],ymm15[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 
64(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2,0],mem[3,1],ymm1[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm1[0,0],ymm11[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,0],ymm8[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[1,0],ymm6[2,0],ymm4[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded 
Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm8[0,0],ymm7[7,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-SLOW-NEXT: subq $1192, %rsp # imm = 0x4A8 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm2[6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -3999,13 +4001,12 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4016,14 +4017,13 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4034,17 +4034,17 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX2-SLOW-NEXT: 
vmovdqa 704(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -4064,13 +4064,13 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4086,12 +4086,14 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4101,21 +4103,18 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm13[2,3],ymm3[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4125,361 +4124,358 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = 
ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,2],ymm15[1,3],ymm11[4,6],ymm15[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm6[1,3],ymm8[4,6],ymm6[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = 
ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm10[0],mem[1],ymm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm5[1,3],ymm7[4,6],ymm5[5,7] +; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm10 ; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 660(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm2 = mem[0,0] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm10 -; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = 
mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastd 772(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] 
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm10, %ymm12 +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: 
vpbroadcastd 664(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm9[1],ymm13[2,3,4],ymm9[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vpbroadcastd 888(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 528(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
xmm4 = xmm8[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 752(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 
$1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) +; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm8, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-SLOW-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rax) +; AVX2-SLOW-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4488,24 +4484,23 @@ ; AVX2-FAST-NEXT: subq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 
480(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm8[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -4517,13 +4512,13 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm7[6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm4[6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4534,9 +4529,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm7[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] @@ -4550,11 +4545,12 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm10[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4579,15 +4575,14 @@ ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4595,19 +4590,20 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vmovdqa 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4622,13 +4618,13 @@ ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4636,180 +4632,178 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; 
AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] ; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = 
ymm14[0],ymm7[0],ymm14[2],ymm7[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,2],ymm5[1,3],ymm14[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,2],ymm6[1,3],ymm8[4,6],ymm6[5,7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $2, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm6 -; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm5[1,3],ymm4[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm4[1],mem[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm12[1],mem[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7] +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm12[1,3],ymm14[4,6],ymm12[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; 
AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 ; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu 
%ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -4818,147 +4812,148 @@ ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm14[1],ymm4[2,3,4],ymm14[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm14[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; 
AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm8 
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm12 +; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps 
%ymm10, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm9, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -4968,47 +4963,45 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) ; AVX2-FAST-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: subq $1192, %rsp # imm = 0x4A8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm2[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5020,13 +5013,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -5037,14 +5029,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -5055,17 +5046,17 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5085,13 +5076,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5107,12 +5098,14 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5122,21 +5115,18 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm13[2,3],ymm3[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5146,361 +5136,358 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: 
vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm4[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,2],ymm15[1,3],ymm11[4,6],ymm15[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), 
%xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 
+; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm9[1,3],ymm7[4,6],ymm9[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,2],ymm6[1,3],ymm8[4,6],ymm6[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2],ymm4[1,3],ymm13[4,6],ymm4[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[0],mem[1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm5[1,3],ymm7[4,6],ymm5[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 548(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 660(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm2 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = 
xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 772(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps 
$225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 664(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm9[1],ymm13[2,3,4],ymm9[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 888(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 640(%rdi), %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 528(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, 
%ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 864(%rdi), %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 752(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = 
mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 
32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5908,687 +5895,688 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2456, %rsp # imm = 0x998 -; SSE-NEXT: movdqa 1088(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1008(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm5 +; SSE-NEXT: subq $2440, %rsp # imm = 0x988 +; SSE-NEXT: movdqa 976(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm13 -; SSE-NEXT: movdqa 608(%rdi), %xmm6 +; SSE-NEXT: movdqa 944(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 912(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm6 ; SSE-NEXT: 
movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm10 +; SSE-NEXT: movdqa 448(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm7 +; SSE-NEXT: movdqa 464(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm15 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1456(%rdi), %xmm1 +; SSE-NEXT: movdqa 1344(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1472(%rdi), %xmm0 +; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1536(%rdi), %xmm2 +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1504(%rdi), %xmm0 +; SSE-NEXT: movdqa 1392(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; 
SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 192(%rdi), %xmm13 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 640(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 608(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1008(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 1024(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 1088(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1456(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 1472(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 1536(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1504(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 976(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm12 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: movdqa 272(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 944(%rdi), %xmm0 +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa 720(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1424(%rdi), %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1120(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 1136(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 1200(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1568(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1392(%rdi), %xmm0 +; SSE-NEXT: movdqa 1584(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 1648(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1616(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm5 ; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: movdqa 384(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm0 +; SSE-NEXT: movdqa 384(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm11 ; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: movdqa 864(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: 
punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm6 +; SSE-NEXT: movdqa 832(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1232(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1248(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movdqa 1312(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1280(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: movdqa 1312(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1680(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1280(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1680(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1696(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; SSE-NEXT: movdqa 1760(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1728(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm8 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; 
SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa 272(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm11 -; SSE-NEXT: movdqa 688(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm14 -; SSE-NEXT: movdqa 720(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1136(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1568(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1584(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; SSE-NEXT: movdqa 1648(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1616(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa 704(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm7[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = 
xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 928(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm5 +; SSE-NEXT: movdqa 1040(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1040(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 928(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pshufd $85, (%rsp), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] 
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa 1152(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: movdqa 1264(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 1376(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1376(%rdi), %xmm6 +; SSE-NEXT: movdqa 1488(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1712(%rdi), %xmm6 +; SSE-NEXT: movdqa 1600(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1600(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1712(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm7[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 960(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 960(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 1072(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 1184(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 1296(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = 
mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 1408(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 1520(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; SSE-NEXT: movdqa 1632(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: movdqa 1744(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movdqa 1744(%rdi), %xmm5 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -6601,33 +6589,35 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1104(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1328(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -6647,8 +6637,7 @@ ; SSE-NEXT: movdqa 1552(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6659,42 +6648,30 @@ ; SSE-NEXT: movdqa 1664(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1776(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6704,16 +6681,17 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6723,26 +6701,25 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm1 ; 
SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6754,7 +6731,8 @@ ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6763,19 +6741,20 @@ ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6784,23 +6763,32 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] @@ -6810,14 +6798,6 @@ ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6833,15 +6813,16 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6863,27 +6844,36 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6892,6 +6882,15 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] @@ -6902,8 +6901,8 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6921,11 +6920,11 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = 
xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6934,8 +6933,7 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6944,7 +6942,8 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6953,34 +6952,26 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6988,11 +6979,11 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7000,11 +6991,11 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7027,26 +7018,25 @@ ; SSE-NEXT: # xmm14 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7056,18 +7046,16 @@ ; SSE-NEXT: # xmm11 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7077,18 +7065,18 @@ ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7103,7 +7091,7 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
+; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,0,1,1] @@ -7120,7 +7108,8 @@ ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7141,7 +7130,8 @@ ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7152,14 +7142,6 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -7168,13 +7150,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7184,38 +7166,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -7363,240 +7353,292 @@ ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $2456, %rsp # imm = 0x998 +; SSE-NEXT: addq $2440, %rsp # imm = 0x988 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3176, %rsp # imm = 0xC68 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: subq $3112, %rsp # imm = 0xC28 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), 
%xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: 
vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] 
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = 
zero,xmm7[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm5[0],ymm11[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1],ymm2[2,2],ymm10[5,5],ymm2[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0],ymm2[3,3],ymm4[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm15[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm10[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0],ymm1[3,3],ymm4[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; 
AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0],ymm1[3,3],ymm5[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0],xmm13[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm13[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] @@ -7606,37 +7648,52 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,0],ymm1[3,3],ymm8[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,2],ymm0[5,5],ymm1[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm3[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm9[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 @@ -7653,661 +7710,585 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm11[0,3],ymm1[7,5],ymm11[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,1],ymm1[2,0],ymm2[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1],ymm1[2,0],ymm15[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] +; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,1],ymm1[2,0],ymm7[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,1],ymm1[2,0],ymm2[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm4[0,3],ymm2[7,5],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[2,1],ymm2[2,0],ymm13[6,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm9[0,3],ymm0[7,5],ymm9[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,1],ymm3[2,0],ymm1[6,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = zero,xmm3[1,2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm10[3,3],ymm1[4,4],ymm10[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm10[1,2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = 
ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1],ymm14[0,3],ymm0[7,5],ymm14[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[0,3],ymm13[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[2,1],ymm13[2,0],ymm0[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[1,0],ymm15[0,0],ymm11[5,4],ymm15[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[3,1],ymm13[0,2],ymm15[7,5],ymm13[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[0,0],ymm15[1,1],ymm3[4,4],ymm15[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded 
Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,0],ymm0[0,0],ymm4[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,1],ymm11[0,2],ymm0[7,5],ymm11[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,0],ymm7[1,1],ymm4[4,4],ymm7[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2],ymm13[2,0],ymm8[4,6],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; 
AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[2,1],ymm12[2,0],ymm6[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,1],ymm0[0,2],ymm11[7,5],ymm0[4,6] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm6[0,3],ymm12[7,5],ymm6[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[2,1],ymm12[2,0],ymm2[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte 
Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,2],ymm15[2,0],ymm8[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm12[0,0],ymm8[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,1],ymm1[1,3],ymm5[4,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2],ymm15[2,0],ymm1[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm10[0,2],ymm1[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 +; 
AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm4[0,0],mem[1,1],ymm4[4,4],mem[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2],ymm13[2,0],ymm11[4,6],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,0],ymm1[0,0],ymm9[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm15[2,0],ymm13[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm4[1,1],ymm1[4,4],ymm4[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm10[0,2],ymm0[7,5],ymm10[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[1,3],ymm0[4,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,2],ymm15[2,0],ymm11[4,6],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[0,0],ymm6[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm9 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,2],ymm9[2,0],ymm7[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,1],ymm8[1,3],ymm12[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,2],ymm7[2,0],ymm11[4,6],ymm7[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0],ymm13[1,1],ymm10[4,4],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm12[0,0],ymm14[5,4],ymm12[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,1],ymm0[0,2],ymm12[7,5],ymm0[4,6] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm10[1,3],ymm4[4,5],ymm10[5,7] -; 
AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm5[1,1],ymm1[4,4],ymm5[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm1[2,0],ymm12[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm9[2,0],ymm2[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[1,1],ymm1[4,4],ymm9[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,0],ymm0[6,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[1,1],ymm1[4,4],ymm2[5,5] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm10[2,0],ymm4[5,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm15[2,0],ymm3[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm8[2,0],ymm12[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps 
$33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm13[2,0],ymm10[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm5[2,0],ymm3[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm0[0,0],ymm6[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = 
ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm13[2,1],mem[3,3],ymm13[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[0,0],ymm15[1,0],ymm9[4,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,1],xmm11[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm4[0,0],ymm5[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[2,0],ymm6[2,0],ymm4[6,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0],xmm9[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm2[2,0],mem[3,1],ymm2[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0],xmm4[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm12[1,0],ymm9[2,0],ymm12[5,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,0],mem[1,0],ymm12[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm7[2,0],mem[3,1],ymm7[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0],xmm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,0],ymm2[2,0],ymm9[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[0,1],xmm7[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm11[2,0],mem[3,1],ymm11[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm2[2,0],ymm7[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[2,0],mem[3,1],ymm10[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[2,0],ymm2[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm10[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = 
xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1],xmm3[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2,0],mem[3,1],ymm1[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[2,0],ymm2[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = 
ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm8[2,0],mem[3,1],ymm8[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0],xmm3[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[2,0],ymm2[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,1],xmm0[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm14[3,1],ymm5[6,4],ymm14[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,0],mem[3,1],ymm0[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8315,7 +8296,8 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0],ymm15[2,0],ymm2[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -8323,277 +8305,273 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0],ymm1[0,0],ymm3[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = 
xmm2[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,0],ymm6[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = 
xmm9[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm2[0,0],ymm4[7,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,0],ymm9[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,0],ymm9[4,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[0,0],ymm8[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,0],ymm9[0,0],ymm4[7,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = 
ymm3[3,0],ymm9[0,0],ymm3[7,4],ymm9[4,4] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,0],ymm9[0,0],ymm3[7,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,0],ymm10[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,0],ymm10[0,0],ymm5[7,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 
# 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r9) +; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) -; AVX1-ONLY-NEXT: addq $3176, %rsp # imm = 0xC68 +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $3112, %rsp # imm = 0xC28 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-SLOW-NEXT: subq $2648, %rsp # imm = 0xA58 +; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8603,155 +8581,151 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 644(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 1316(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1092(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 1344(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm7[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 1648(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 1764(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1540(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: 
vmovdqa 128(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm14[6],ymm13[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 868(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; 
AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 1092(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1316(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 1648(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastd 1540(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastd 1764(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-SLOW-NEXT: 
vmovdqa 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8760,20 +8734,20 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8782,20 +8756,20 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8804,21 +8778,22 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8827,20 +8802,20 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 
; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8849,17 +8824,17 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload @@ -8872,16 +8847,17 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8898,230 +8874,199 @@ ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4],ymm2[5],mem[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr 
{{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; 
AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] +; 
AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm4[3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm4[1,3],ymm15[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = 
xmm2[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm12[1,3],ymm6[4,6],ymm12[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm11 -; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9129,10 +9074,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm13[1,3],ymm8[4,6],ymm13[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9144,215 +9089,249 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm3[1,3],ymm9[4,6],ymm3[5,7] +; AVX2-SLOW-NEXT: 
vmovaps %ymm3, %ymm11 +; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm12[0,2],mem[1,3],ymm12[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm10[0,2],mem[1,3],ymm10[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm7 = mem[0,0] +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm0 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm14 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), 
%xmm9 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm14, %ymm8 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm12[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; 
AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm10 -; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] @@ -9360,164 +9339,155 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm13[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = 
xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = 
ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = 
xmm9[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm14, %ymm11 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: 
vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -9527,13 +9497,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -9543,13 +9513,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rdx) ; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -9559,22 +9529,30 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r9) @@ -9586,15 +9564,16 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -9604,46 +9583,47 @@ ; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) -; AVX2-SLOW-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-SLOW-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 992(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 420(%rdi), %ymm3 +; AVX2-FAST-NEXT: 
vpbroadcastd 196(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9652,156 +9632,153 @@ ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 644(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 +; AVX2-FAST-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 1316(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 1092(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1344(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; 
AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1440(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq 1424(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 1764(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 1540(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 420(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] ; AVX2-FAST-NEXT: vpermd %ymm1, 
%ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 868(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 896(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 1092(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 1316(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 1440(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 1424(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 1540(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 1764(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -9809,17 +9786,16 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 
512(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9829,20 +9805,20 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm8[1],mem[2,3,4],ymm8[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -9850,19 +9826,20 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm13[1],ymm5[2,3,4],ymm13[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm6[1],mem[2,3,4],ymm6[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -9870,19 +9847,17 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -9894,17 +9869,16 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -9916,17 +9890,17 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -9938,17 +9912,17 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -9958,208 
+9932,181 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 
1128(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1324(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 1128(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX2-FAST-NEXT: vpbroadcastd 1324(%rdi), %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm13 -; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm3[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = 
xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm12[1,3],ymm14[4,6],ymm12[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm11[1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,2],ymm10[1,3],ymm5[4,6],ymm10[5,7] +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,2],ymm11[1,3],ymm15[4,6],ymm11[5,7] +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm9[1],mem[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm10 -; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm7[1,3],ymm9[4,6],ymm7[5,7] +; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10170,10 +10117,39 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm10[1,3],ymm14[4,6],ymm10[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10185,16 +10161,17 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm5[1,3],ymm1[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] ; AVX2-FAST-NEXT: # xmm4 = mem[0,0] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10204,7 +10181,7 
@@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10220,82 +10197,83 @@ ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovaps %ymm9, %ymm15 +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 
+; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps %ymm12, %ymm15 +; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; 
AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10305,11 +10283,11 @@ ; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -10317,65 +10295,65 @@ ; AVX2-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,3,1,0,7,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, 
%ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm13[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 
= ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] @@ -10385,177 +10363,170 @@ ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = 
xmm3[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm9 ; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm15[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermps %ymm15, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; 
AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; 
AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = 
mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; 
AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm5 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vblendps 
{{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10565,13 +10536,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10581,13 +10552,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10597,22 +10568,30 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r8) +; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r9) @@ -10630,9 +10609,9 @@ ; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload @@ -10647,44 +10626,42 @@ ; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm14, 128(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, (%rax) ; AVX2-FAST-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: subq $2648, %rsp # imm = 0xA58 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 
%ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10694,155 +10671,151 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 644(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1316(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1092(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1344(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm7[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1648(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1764(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1540(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm14[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX2-FAST-PERLANE-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 868(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1092(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1316(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1648(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1540(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1764(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10851,20 +10824,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10873,20 +10846,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10895,21 +10868,22 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10918,20 +10892,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10940,17 +10914,17 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload @@ -10963,16 +10937,17 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -10989,230 +10964,199 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # 
ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = 
ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm4[3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm4[0,2],ymm15[1,3],ymm4[4,6],ymm15[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0,2],mem[1,3],ymm11[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm4[1,3],ymm15[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm1[1,3],ymm6[4,6],ymm1[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,2],ymm12[1,3],ymm6[4,6],ymm12[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm7[1,3],ymm10[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), 
%ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11220,10 +11164,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm13[1,3],ymm8[4,6],ymm13[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11235,215 +11179,249 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm3[1,3],ymm9[4,6],ymm3[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
ymm12[0,2],mem[1,3],ymm12[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[0,2],mem[1,3],ymm10[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm14 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; 
AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm3 +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; 
AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = 
xmm13[0,1,2],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm14, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm14, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm12[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] @@ -11451,164 +11429,155 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = 
xmm6[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1,2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; 
AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3] +; 
AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm9 -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm14, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 
$1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -11618,13 +11587,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) ; AVX2-FAST-PERLANE-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -11634,13 +11603,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -11650,22 +11619,30 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r9) @@ -11677,15 +11654,16 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -11695,424 +11673,426 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-PERLANE-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 
832(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm0 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm7, %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm1, %zmm2 
+; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm7, %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm7, %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; 
AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm10, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm10, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = 
[25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm28, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm28, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm28, %zmm20 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm4, %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm28, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm28, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm7, %zmm10, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm9, %zmm28 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm6, %zmm9, %zmm13 +; AVX512F-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm21, %zmm9, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm17, %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm6, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm26, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm27, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; 
AVX512F-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm19, %zmm18, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm18, %zmm19, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm18, %zmm19, %zmm27 +; AVX512F-NEXT: vpermi2d %zmm18, %zmm19, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 
%zmm5, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm6, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm19, %zmm23 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11,18,25] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,12,19,26] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm19, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm19, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 
-; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512F-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k2} ; AVX512F-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm19 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512F-NEXT: vinserti32x4 $0, (%rsp), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm22, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm24 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm29, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm25 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm30, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm16, %zmm28, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -12121,450 +12101,448 @@ ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqa64 
%zmm2, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm14 +; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512BW-NEXT: 
vmovdqa64 704(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm2 +; 
AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, 
%zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: 
vmovdqa64 192(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm30, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm10, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm11, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; 
AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm28, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm28, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm10, %zmm28 +; AVX512BW-NEXT: vpermt2d 
%zmm8, %zmm9, %zmm28 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm6, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm27, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm19, %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm19, %zmm18, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm19, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm19, %zmm27 +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; 
AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm6, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm19, %zmm23 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm19, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), 
%zmm1 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm19, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} ; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k2} ; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} 
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte 
Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm20 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512BW-NEXT: vinserti32x4 $0, (%rsp), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm22, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm24 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm29, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm30, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm16, %zmm28, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm28, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm17 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 
64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm28, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload @@ -12573,41 +12551,37 @@ ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <448 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -114,12 +114,14 @@ ; ; AVX512-SLOW-LABEL: load_i32_stride8_vf2: ; 
AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: pushq %rbx ; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-SLOW-NEXT: vmovd %xmm1, %ebx +; AVX512-SLOW-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -142,17 +144,20 @@ ; AVX512-SLOW-NEXT: vmovlps %xmm6, (%r11) ; AVX512-SLOW-NEXT: vmovlps %xmm4, (%r10) ; AVX512-SLOW-NEXT: vmovlps %xmm1, (%rax) +; AVX512-SLOW-NEXT: popq %rbx ; AVX512-SLOW-NEXT: vzeroupper ; AVX512-SLOW-NEXT: retq ; ; AVX512-FAST-LABEL: load_i32_stride8_vf2: ; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: pushq %rbx ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-FAST-NEXT: vmovd %xmm1, %ebx +; AVX512-FAST-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -160,8 +165,7 @@ ; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,13,5,13,5,13,5,13] ; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 ; AVX512-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] @@ -176,17 +180,20 @@ ; AVX512-FAST-NEXT: vmovq %xmm6, (%r11) ; AVX512-FAST-NEXT: vmovq %xmm4, (%r10) ; AVX512-FAST-NEXT: vmovq %xmm1, (%rax) +; AVX512-FAST-NEXT: popq %rbx ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq ; ; AVX512BW-FAST-LABEL: load_i32_stride8_vf2: ; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: pushq %rbx ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-FAST-NEXT: vmovd %xmm1, %ebx +; AVX512BW-FAST-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -194,8 +201,7 @@ ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] @@ -210,6 +216,7 @@ ; AVX512BW-FAST-NEXT: vmovq %xmm6, (%r11) ; AVX512BW-FAST-NEXT: vmovq %xmm4, (%r10) ; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FAST-NEXT: popq %rbx ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 @@ -454,99 +461,100 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm15 -; SSE-NEXT: movaps 176(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm15 ; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm11 -; SSE-NEXT: movaps 160(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps 192(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps 128(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] ; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] ; SSE-NEXT: movaps 240(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps 208(%rdi), %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: movaps %xmm15, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movaps 16(%rdi), %xmm14 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = 
xmm15[1],xmm11[1] +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm6, 16(%rdx) ; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movaps %xmm4, 16(%r8) ; SSE-NEXT: movaps %xmm10, (%r8) -; SSE-NEXT: movaps %xmm6, 16(%r8) -; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: movaps %xmm13, 16(%r9) +; SSE-NEXT: movaps %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, (%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) +; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf8: @@ -862,7 +870,7 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i32_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $296, %rsp # imm = 0x128 +; SSE-NEXT: subq $280, %rsp # imm = 0x118 ; SSE-NEXT: movaps 288(%rdi), %xmm6 ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -950,14 +958,14 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 176(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -966,65 +974,64 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 464(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm6 -; SSE-NEXT: movaps 
%xmm6, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps 400(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 368(%rdi), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 336(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 272(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 304(%rdi), %xmm13 +; SSE-NEXT: movaps 272(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps 112(%rdi), %xmm12 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: movaps %xmm3, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: unpckhps (%rsp), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movaps %xmm7, 
%xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: movaps %xmm15, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm14[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] ; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] @@ -1033,58 +1040,57 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm1, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: movaps %xmm9, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm7, 32(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm13, 16(%rax) ; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps %xmm8, (%rax) -; SSE-NEXT: addq $296, %rsp # imm = 0x128 +; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf16: @@ -1924,48 +1930,48 @@ ; SSE-LABEL: load_i32_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $952, %rsp # imm = 0x3B8 -; SSE-NEXT: movaps 544(%rdi), %xmm5 +; SSE-NEXT: movaps 672(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm6 +; SSE-NEXT: movaps 736(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm7 +; SSE-NEXT: movaps 704(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 672(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm9 +; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 128(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 544(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm0 +; SSE-NEXT: movaps 608(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} 
xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 512(%rdi), %xmm1 +; SSE-NEXT: movaps 640(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1973,30 +1979,29 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm1 +; SSE-NEXT: movaps 352(%rdi), %xmm14 +; SSE-NEXT: movaps 320(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 416(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm1 +; SSE-NEXT: movaps 864(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 928(%rdi), %xmm2 +; SSE-NEXT: movaps 800(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm1 +; SSE-NEXT: movaps 768(%rdi), %xmm1 
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2004,130 +2009,130 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm4 +; SSE-NEXT: movaps 448(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movaps 416(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 288(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 800(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 960(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps 928(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 896(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 
32(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 
; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: unpckhps (%rsp), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm12[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm12 @@ -2317,54 +2322,54 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) @@ -2441,28 +2446,28 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: subq $1768, %rsp # imm = 0x6E8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2471,81 +2476,81 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0] @@ -2724,630 +2729,624 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[4],ymm1[4],ymm15[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; 
AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm5[1,0],ymm13[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm7[1,0],ymm1[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[1,0],ymm15[1,0],ymm12[5,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] -; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,0],ymm5[1,0],ymm11[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[4],ymm3[4],ymm8[5],ymm3[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,0],ymm5[1,0],ymm3[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm8[1,0],ymm6[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[4],mem[4],ymm9[5],mem[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm9[1,0],ymm1[5,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm4[1,0],ymm6[5,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm9[1,0],mem[1,0],ymm9[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm2[1,0],ymm14[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[4],ymm10[4],ymm2[5],ymm10[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm10[1,0],ymm9[1,0],ymm10[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[6],ymm12[6],ymm15[7],ymm12[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[6],ymm6[6],ymm15[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = 
ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,0],ymm15[3,0],ymm6[7,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm6[3,0],ymm8[7,4],ymm6[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,0],ymm13[3,0],ymm10[7,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps 
{{.*#+}} ymm4 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm14 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm4, %xmm13 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 
64(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, %xmm3 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2] 
+; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vmovaps %xmm7, %xmm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 @@ -3356,15 +3355,16 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload @@ -3377,347 +3377,350 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm12[2],mem[2],xmm12[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm13[2],mem[2],xmm13[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; 
AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = 
ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 -; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[6],ymm14[6],ymm4[7],ymm14[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm14[2,2,2,2] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; 
AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -5992,38 +5995,36 @@ ; 
SSE-LABEL: load_i32_stride8_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8 -; SSE-NEXT: movaps 288(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm7 +; SSE-NEXT: movaps 416(%rdi), %xmm6 +; SSE-NEXT: movaps 480(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps 448(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 256(%rdi), %xmm9 +; SSE-NEXT: movaps 352(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm15 +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -6031,21 +6032,36 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 256(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps 384(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 544(%rdi), %xmm11 +; SSE-NEXT: movaps 512(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm9 +; SSE-NEXT: movaps 736(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 672(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 640(%rdi), %xmm1 @@ -6056,17 +6072,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 544(%rdi), %xmm15 -; SSE-NEXT: movaps 512(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movaps 864(%rdi), %xmm8 +; SSE-NEXT: movaps 832(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps 800(%rdi), %xmm9 +; SSE-NEXT: movaps 768(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6087,16 +6100,15 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm0 +; SSE-NEXT: movaps 1120(%rdi), %xmm4 +; SSE-NEXT: movaps 1088(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 800(%rdi), %xmm14 -; SSE-NEXT: movaps 768(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps 1056(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1024(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6117,15 +6129,13 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1120(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1088(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 1056(%rdi), %xmm2 +; SSE-NEXT: movaps 1376(%rdi), %xmm3 +; SSE-NEXT: movaps 1344(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 1312(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1024(%rdi), %xmm1 +; SSE-NEXT: movaps 1280(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6148,14 +6158,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1376(%rdi), %xmm1 +; SSE-NEXT: movaps 1632(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1344(%rdi), %xmm0 +; SSE-NEXT: movaps 1600(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1312(%rdi), %xmm2 +; SSE-NEXT: movaps 1568(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1280(%rdi), %xmm1 +; SSE-NEXT: movaps 1536(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6178,17 +6188,15 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1632(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1600(%rdi), %xmm0 +; SSE-NEXT: movaps 1888(%rdi), %xmm10 +; SSE-NEXT: movaps 1856(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1568(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1536(%rdi), %xmm2 -; SSE-NEXT: 
movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movaps 1824(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1792(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6204,184 +6212,174 @@ ; SSE-NEXT: movaps 1920(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1888(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1856(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1824(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1792(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps (%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: unpckhps (%rsp), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 
16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movaps %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm12, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6389,11 +6387,12 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6401,12 +6400,12 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6414,31 +6413,26 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm7 @@ -6457,7 +6451,7 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm3 ; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps 304(%rdi), %xmm4 ; SSE-NEXT: movaps 272(%rdi), %xmm1 @@ -6470,7 +6464,7 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm5 ; SSE-NEXT: movaps 464(%rdi), %xmm0 -; 
SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movaps 432(%rdi), %xmm6 ; SSE-NEXT: movaps 400(%rdi), %xmm1 @@ -6675,15 +6669,15 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6778,14 +6772,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] @@ -6793,14 +6787,14 @@ ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6864,14 +6858,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 176(%rsi) @@ -6880,13 +6866,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps %xmm1, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps %xmm1, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6896,38 +6882,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rdx) +; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rdx) +; SSE-NEXT: movaps %xmm1, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rdx) +; SSE-NEXT: movaps %xmm1, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rdx) +; SSE-NEXT: movaps %xmm1, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 224(%rcx) @@ -7070,7 +7064,7 @@ ; SSE-NEXT: movaps %xmm1, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rax) @@ -7096,7 +7090,7 @@ ; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -7113,27 +7107,26 @@ ; AVX1-ONLY-LABEL: load_i32_stride8_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3720, %rsp # imm = 0xE88 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7142,258 +7135,284 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 
832(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm11[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] @@ -7401,8 +7420,8 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = 
mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -7410,8 +7429,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -7422,285 +7440,256 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = 
xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm6[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm15[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm13[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = 
mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; 
AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7714,7 +7703,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3] @@ -7890,7 +7879,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -7919,7 +7908,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] @@ -8054,7 +8043,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8074,7 +8063,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -8194,7 +8183,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -8212,7 +8201,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8327,7 +8316,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 
; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8346,7 +8335,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -8364,14 +8353,6 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) @@ -8380,13 +8361,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8396,13 +8377,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8412,22 +8393,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) @@ -8494,29 +8483,29 @@ ; AVX2-ONLY-LABEL: load_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3528, %rsp # imm = 0xDC8 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: 
vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm13 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -8524,28 +8513,29 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8553,89 +8543,88 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8643,29 +8632,29 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8673,29 +8662,29 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8703,47 +8692,59 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vbroadcastss %xmm9, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 
96(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm13, %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8757,13 +8758,12 @@ ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8773,17 +8773,15 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8801,24 +8799,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: 
vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8836,9 +8819,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8856,9 +8839,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8877,7 +8860,7 @@ ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpcklps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -8889,9 +8872,9 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] @@ -8901,11 +8884,9 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8916,257 +8897,254 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, 
%ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm4[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1] ; AVX2-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm5[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = xmm7[2],mem[2],xmm7[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, 
%xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm1, 
%xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9315,7 +9293,7 @@ ; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] @@ -9326,7 +9304,7 @@ ; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm9 ; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm6 @@ -9446,7 +9424,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1428(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9466,7 +9444,7 @@ ; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] @@ -9582,7 +9560,7 @@ ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] @@ -9601,7 +9579,7 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9612,7 +9590,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 2040(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] @@ -9630,7 +9608,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -9717,14 +9695,6 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) @@ -9733,13 +9703,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9749,13 +9719,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9765,22 +9735,30 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r9) @@ -9814,9 +9792,9 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) @@ -9845,1231 +9823,4829 @@ ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: load_i32_stride8_vf64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm7 -; AVX512F-NEXT: vmovaps 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 
%zmm2 {%k1} -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm9 -; AVX512F-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d 
%zmm26, %zmm9, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 
%zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: 
vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 -; 
AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, 
%zmm11, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 -; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: 
vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, (%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: load_i32_stride8_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm7 -; AVX512BW-NEXT: vmovaps 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 
1472(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm19, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm8 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm9, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm4 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; 
AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm15 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, 
%zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm2, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm2, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm7, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm10, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm7, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm10, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
-; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm21 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm11, %zmm1 
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm6 -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm29, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm8 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rcx) 
-; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm10 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, (%r9) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 256(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: movb $-64, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm14 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm10 
= mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm24 = mem[0,1,0,1] +; 
AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, (%r9) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), 
%zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: movb $-64, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: 
vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, 
%zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: 
vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 
{%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 
%zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = 
mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, (%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, (%r9) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, (%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm7, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 
%zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] 
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, 
%zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 
64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd 
%eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, 
%zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, 
%ymm6, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; 
AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm16, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: 
vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, (%r9) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, (%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = 
[0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: movb $-64, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 
%zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm19 +; 
AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 
%zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, 
%zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = 
mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, (%r9) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 
(%rax) +; AVX512DQBW-SLOW-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $3048, %rsp # imm = 0xBE8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovaps 640(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: movb $-64, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 +; 
AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm20, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, 
%zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d 
%zmm25, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm20, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm13, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm7, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm6, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm3, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm6, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, 
%zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31, %zmm31 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm24, %zmm1 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm21, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm27, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 192(%rcx) +; 
AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, (%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, (%r9) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, (%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQBW-FAST-NEXT: addq $3048, %rsp # imm = 0xBE8 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -51,16 +51,16 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; 
SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf4: @@ -128,48 +128,48 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm4 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, 32(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps %xmm11, (%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm9, 48(%rsi) +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: 
vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -179,18 +179,18 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -220,36 +220,36 @@ ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdi), %xmm2 ; SSE-NEXT: movaps 240(%rdi), %xmm10 -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm3 ; SSE-NEXT: movaps 176(%rdi), %xmm12 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm13 -; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 144(%rdi), %xmm14 +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps 112(%rdi), %xmm15 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps %xmm6, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] -; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm5, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm3, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm10[0] -; SSE-NEXT: 
unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] @@ -258,20 +258,20 @@ ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm12, 96(%rsi) ; SSE-NEXT: movaps %xmm11, 112(%rsi) -; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps %xmm12, 96(%rsi) ; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps %xmm9, 32(%rsi) +; SSE-NEXT: movaps %xmm15, 64(%rsi) ; SSE-NEXT: movaps %xmm13, 48(%rsi) -; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm9, 32(%rsi) ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 96(%rdx) -; SSE-NEXT: movaps %xmm6, 80(%rdx) -; SSE-NEXT: movaps %xmm3, 64(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm2, 112(%rdx) +; SSE-NEXT: movaps %xmm3, 96(%rdx) +; SSE-NEXT: movaps %xmm4, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm6, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) @@ -283,30 +283,30 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -316,34 +316,34 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -378,21 +378,21 @@ ; SSE-LABEL: load_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 
64(%rdi), %xmm5 -; SSE-NEXT: movaps 240(%rdi), %xmm14 -; SSE-NEXT: movaps 224(%rdi), %xmm8 -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm13 -; SSE-NEXT: movaps 176(%rdi), %xmm4 -; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 240(%rdi), %xmm11 +; SSE-NEXT: movaps 224(%rdi), %xmm6 +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps 176(%rdi), %xmm2 +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps 128(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -428,236 +428,236 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps 336(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm13 +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 352(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm12 +; SSE-NEXT: movaps 400(%rdi), %xmm0 +; SSE-NEXT: movaps 384(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps 432(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm0 -; SSE-NEXT: movaps 480(%rdi), %xmm7 +; SSE-NEXT: movaps 464(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps 448(%rdi), %xmm3 +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 480(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps 32(%rdi), %xmm8 -; SSE-NEXT: 
movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, 224(%rsi) -; SSE-NEXT: movaps %xmm11, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm6, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps %xmm2, 240(%rsi) +; SSE-NEXT: movaps %xmm11, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps %xmm10, 192(%rsi) +; SSE-NEXT: movaps %xmm6, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm10, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) -; SSE-NEXT: movaps %xmm9, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm13, 160(%rdx) -; SSE-NEXT: movaps %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps %xmm14, 192(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movaps %xmm3, 240(%rdx) +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps %xmm9, 208(%rdx) +; SSE-NEXT: movaps %xmm12, 192(%rdx) +; SSE-NEXT: movaps %xmm13, 176(%rdx) +; SSE-NEXT: movaps %xmm15, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps 
%xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = 
ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; 
AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride2_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm5[1],ymm15[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm6[1],ymm14[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm3[0],ymm15[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm15[1],ymm3[1],ymm15[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = 
ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps 
%ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -676,22 +676,22 @@ ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm11 +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <64 x i64>, ptr %in.vec, align 64 @@ -1034,30 +1034,30 @@ ; AVX1-ONLY-LABEL: load_i64_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm9, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm6[0],ymm12[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm11, %ymm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] @@ -1065,7 +1065,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],ymm6[1],ymm12[3],ymm6[3] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -1073,7 +1073,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] @@ -1085,101 +1085,101 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 320(%rsi) +; 
AVX1-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 352(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1187,21 +1187,21 @@ ; AVX2-ONLY-LABEL: load_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 -; 
AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] @@ -1229,126 +1229,126 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 
992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; 
AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1361,16 +1361,16 @@ ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 
320(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 @@ -1384,34 +1384,34 @@ ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 ; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm23 +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm15 ; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 ; AVX512-NEXT: vpermt2q %zmm6, %zmm24, %zmm7 ; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm5 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm15 -; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm22, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm21, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 384(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm23, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 256(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm13, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -56,31 +56,18 @@ ; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rcx) ; AVX2-ONLY-NEXT: retq ; -; AVX512-SLOW-LABEL: load_i64_stride3_vf2: -; AVX512-SLOW: # %bb.0: -; AVX512-SLOW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7] -; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX512-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX512-SLOW-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512-SLOW-NEXT: vmovdqa %xmm1, (%rcx) -; AVX512-SLOW-NEXT: vzeroupper -; 
AVX512-SLOW-NEXT: retq -; -; AVX512-FAST-LABEL: load_i64_stride3_vf2: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] -; AVX512-FAST-NEXT: vmovaps (%rdi), %zmm1 -; AVX512-FAST-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512-FAST-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] -; AVX512-FAST-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX512-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512-FAST-NEXT: vmovaps %xmm1, (%rsi) -; AVX512-FAST-NEXT: vmovaps %xmm0, (%rdx) -; AVX512-FAST-NEXT: vmovaps %xmm2, (%rcx) -; AVX512-FAST-NEXT: vzeroupper -; AVX512-FAST-NEXT: retq +; AVX512-LABEL: load_i64_stride3_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX512-NEXT: vpunpcklqdq 24(%rdi){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX512-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) +; AVX512-NEXT: retq %wide.vec = load <6 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <6 x i64> %wide.vec, <6 x i64> poison, <2 x i32> %strided.vec1 = shufflevector <6 x i64> %wide.vec, <6 x i64> poison, <2 x i32> @@ -120,17 +107,18 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -184,46 +172,46 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i64_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movapd 128(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm1 +; SSE-NEXT: movapd 176(%rdi), %xmm2 +; SSE-NEXT: movapd 128(%rdi), %xmm1 ; SSE-NEXT: movapd 80(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm3 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 160(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm5 ; SSE-NEXT: movapd 112(%rdi), %xmm8 -; SSE-NEXT: movapd 144(%rdi), %xmm5 -; SSE-NEXT: movapd 160(%rdi), %xmm9 ; SSE-NEXT: movapd (%rdi), %xmm6 -; SSE-NEXT: movapd 16(%rdi), %xmm10 -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: movapd 48(%rdi), %xmm7 +; SSE-NEXT: movapd 16(%rdi), 
%xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm10 ; SSE-NEXT: movapd 64(%rdi), %xmm11 ; SSE-NEXT: movapd %xmm11, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1] -; SSE-NEXT: movapd %xmm9, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] +; SSE-NEXT: movapd %xmm8, %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm5[0],xmm13[1] -; SSE-NEXT: movapd %xmm8, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] -; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: movapd %xmm7, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movapd %xmm9, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm0[0] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm2[0] -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm3[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] -; SSE-NEXT: movapd %xmm14, 32(%rsi) -; SSE-NEXT: movapd %xmm13, 48(%rsi) -; SSE-NEXT: movapd %xmm15, (%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: movapd %xmm14, 48(%rsi) +; SSE-NEXT: movapd %xmm13, 32(%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rsi) -; SSE-NEXT: movapd %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm5, 48(%rdx) +; SSE-NEXT: movapd %xmm15, (%rsi) +; SSE-NEXT: movapd %xmm4, 48(%rdx) +; SSE-NEXT: movapd %xmm5, 32(%rdx) +; SSE-NEXT: movapd %xmm10, 16(%rdx) ; SSE-NEXT: movapd %xmm6, (%rdx) -; SSE-NEXT: movapd %xmm7, 16(%rdx) -; SSE-NEXT: movapd %xmm2, 32(%rcx) -; SSE-NEXT: movapd %xmm1, 48(%rcx) -; SSE-NEXT: movapd %xmm4, (%rcx) +; SSE-NEXT: movapd %xmm2, 48(%rcx) +; SSE-NEXT: movapd %xmm1, 32(%rcx) ; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm3, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf8: @@ -231,29 +219,31 @@ ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0],ymm2[1],ymm5[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0],ymm4[1],ymm7[2],ymm4[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[1],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -332,25 +322,25 @@ ; SSE-LABEL: load_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: movapd 176(%rdi), %xmm1 -; SSE-NEXT: movapd 224(%rdi), %xmm4 +; SSE-NEXT: movapd 176(%rdi), %xmm0 +; SSE-NEXT: movapd 128(%rdi), %xmm1 ; SSE-NEXT: movapd 272(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm2 -; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 144(%rdi), %xmm6 -; SSE-NEXT: movapd 160(%rdi), %xmm14 -; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm11 -; SSE-NEXT: movapd 240(%rdi), %xmm10 -; SSE-NEXT: movapd 256(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm9 +; SSE-NEXT: movapd 224(%rdi), %xmm4 +; SSE-NEXT: movapd 144(%rdi), %xmm5 +; SSE-NEXT: movapd 160(%rdi), %xmm12 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 112(%rdi), %xmm14 +; SSE-NEXT: movapd 240(%rdi), %xmm7 +; SSE-NEXT: movapd 256(%rdi), %xmm11 +; SSE-NEXT: movapd 48(%rdi), %xmm10 ; SSE-NEXT: movapd 64(%rdi), %xmm15 +; SSE-NEXT: movapd 192(%rdi), %xmm9 +; SSE-NEXT: movapd 208(%rdi), %xmm13 ; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm2[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm2[0] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm14, %xmm15 @@ -366,200 +356,204 @@ ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm13, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} 
xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm11, %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm3[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 336(%rdi), %xmm13 -; SSE-NEXT: movapd 352(%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 288(%rdi), %xmm13 +; SSE-NEXT: movapd 304(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm7 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1] -; SSE-NEXT: movapd 368(%rdi), %xmm11 +; SSE-NEXT: movapd 320(%rdi), %xmm11 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm11[0] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: movapd 288(%rdi), %xmm0 -; SSE-NEXT: movapd 304(%rdi), %xmm2 +; SSE-NEXT: movapd 336(%rdi), %xmm0 +; SSE-NEXT: movapd 352(%rdi), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd 320(%rdi), %xmm6 +; SSE-NEXT: movapd 368(%rdi), %xmm6 ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm6[0] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] ; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 16(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm3 +; SSE-NEXT: movapd 16(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd %xmm1, 96(%rsi) -; SSE-NEXT: movapd %xmm14, 32(%rsi) -; SSE-NEXT: movapd %xmm7, 112(%rsi) -; SSE-NEXT: movapd %xmm15, 48(%rsi) -; SSE-NEXT: movapd %xmm8, 64(%rsi) -; SSE-NEXT: movapd %xmm3, (%rsi) -; SSE-NEXT: movapd %xmm12, 80(%rsi) +; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: movapd %xmm1, 112(%rsi) +; SSE-NEXT: movapd %xmm14, 48(%rsi) +; SSE-NEXT: movapd %xmm7, 96(%rsi) +; SSE-NEXT: movapd %xmm15, 32(%rsi) +; SSE-NEXT: movapd %xmm8, 80(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movapd %xmm0, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movapd %xmm13, 112(%rdx) +; SSE-NEXT: movapd %xmm12, 64(%rsi) +; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movapd %xmm13, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) -; SSE-NEXT: movapd %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm10, 80(%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm6, 96(%rcx) -; SSE-NEXT: movapd %xmm11, 112(%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movapd %xmm10, 16(%rdx) +; SSE-NEXT: movapd %xmm9, 64(%rdx) +; SSE-NEXT: movapd %xmm2, (%rdx) +; SSE-NEXT: movapd %xmm6, 
112(%rcx) +; SSE-NEXT: movapd %xmm11, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movapd %xmm4, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm5, (%rcx) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[1],ymm5[0],ymm7[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[1],ymm2[0],ymm8[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[1],ymm1[0],ymm9[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm2[1],ymm11[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm10[1],ymm4[0],ymm10[3],ymm4[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm12[0],ymm9[1],ymm12[2],ymm9[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm11[1],ymm8[0],ymm11[3],ymm8[2] +; AVX1-ONLY-NEXT: vbroadcastsd 
272(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm12[1],ymm7[0],ymm12[3],ymm7[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm11[1],ymm5[0],ymm11[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 64(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; 
AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; 
AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -613,55 +607,55 @@ ; SSE-LABEL: load_i64_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movapd 224(%rdi), %xmm6 ; SSE-NEXT: movapd 272(%rdi), %xmm9 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 176(%rdi), %xmm8 -; SSE-NEXT: movapd 80(%rdi), %xmm7 -; SSE-NEXT: movapd 96(%rdi), %xmm10 -; SSE-NEXT: movapd 112(%rdi), %xmm0 -; SSE-NEXT: movapd 144(%rdi), %xmm11 -; SSE-NEXT: movapd 160(%rdi), %xmm1 -; SSE-NEXT: movapd 192(%rdi), %xmm12 -; SSE-NEXT: movapd 208(%rdi), %xmm2 -; SSE-NEXT: movapd 240(%rdi), %xmm13 -; SSE-NEXT: movapd 256(%rdi), %xmm3 -; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 224(%rdi), %xmm8 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 128(%rdi), %xmm6 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm10 +; SSE-NEXT: movapd 160(%rdi), %xmm0 +; SSE-NEXT: movapd 96(%rdi), %xmm11 +; SSE-NEXT: movapd 112(%rdi), %xmm2 +; SSE-NEXT: movapd 240(%rdi), %xmm12 +; SSE-NEXT: movapd 256(%rdi), 
%xmm1 +; SSE-NEXT: movapd 48(%rdi), %xmm13 ; SSE-NEXT: movapd 64(%rdi), %xmm4 +; SSE-NEXT: movapd 192(%rdi), %xmm14 +; SSE-NEXT: movapd 208(%rdi), %xmm3 ; SSE-NEXT: movapd %xmm4, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm13[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm7[0] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm5[0] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm11[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm5[0] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm6[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm3, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm8[0] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm6[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm3, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 ; SSE-NEXT: movapd 304(%rdi), %xmm0 @@ -713,106 
+707,106 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 528(%rdi), %xmm15 +; SSE-NEXT: movapd 528(%rdi), %xmm2 ; SSE-NEXT: movapd 544(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] ; SSE-NEXT: movapd 560(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 576(%rdi), %xmm12 ; SSE-NEXT: movapd 592(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1] +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd 608(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 624(%rdi), %xmm8 +; SSE-NEXT: movapd 624(%rdi), %xmm9 ; SSE-NEXT: movapd 640(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd %xmm0, %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE-NEXT: movapd 656(%rdi), %xmm13 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm13[0] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: movapd 672(%rdi), %xmm6 +; SSE-NEXT: movapd 672(%rdi), %xmm7 ; SSE-NEXT: movapd 688(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] -; SSE-NEXT: movapd 704(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd 720(%rdi), %xmm4 -; SSE-NEXT: movapd 736(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: movapd 752(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: movapd (%rdi), %xmm2 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] +; SSE-NEXT: movapd 704(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd 720(%rdi), %xmm3 +; SSE-NEXT: movapd 736(%rdi), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd 752(%rdi), %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; SSE-NEXT: movapd (%rdi), %xmm5 ; SSE-NEXT: movapd 16(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd 32(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm5, 224(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd 32(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm2, 
240(%rsi) +; SSE-NEXT: movapd %xmm14, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movapd %xmm4, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movapd %xmm3, 240(%rsi) +; SSE-NEXT: movapd %xmm10, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) +; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movapd %xmm14, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movapd %xmm15, 192(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movapd %xmm1, (%rsi) -; SSE-NEXT: movapd %xmm11, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm6, 224(%rdx) -; SSE-NEXT: movapd %xmm4, 240(%rdx) +; SSE-NEXT: movapd %xmm3, 240(%rdx) +; SSE-NEXT: movapd %xmm7, 224(%rdx) +; SSE-NEXT: movapd %xmm9, 208(%rdx) ; SSE-NEXT: movapd %xmm12, 192(%rdx) -; SSE-NEXT: movapd %xmm8, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) -; SSE-NEXT: movapd %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movapd %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm7, 240(%rcx) -; SSE-NEXT: movapd %xmm10, 224(%rcx) +; SSE-NEXT: movapd %xmm5, (%rdx) +; SSE-NEXT: movapd %xmm8, 
240(%rcx) +; SSE-NEXT: movapd %xmm11, 224(%rcx) ; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) @@ -838,280 +832,277 @@ ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm9, (%rcx) +; SSE-NEXT: movapd %xmm6, (%rcx) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $232, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = mem[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm10[0],ymm12[3],ymm10[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[1],ymm2[0],ymm14[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm8[1],ymm1[0],ymm8[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm8[0],ymm11[3],ymm8[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[1],ymm6[0],ymm13[3],ymm6[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd 
%ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2] ; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm11[0],ymm10[3],ymm11[2] ; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm13[0],ymm2[3],ymm13[2] -; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm7[0],ymm15[3],ymm7[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[1],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm1[0],ymm6[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm0[0],ymm10[1],ymm0[2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 
304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm3[1],mem[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm8[1],mem[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0],ymm1[1],ymm13[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 
32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm1[0],ymm8[1],ymm1[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0],ymm15[1],ymm0[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0],ymm6[1],ymm0[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm6[1],ymm10[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm6[0],ymm10[1],ymm6[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm6[1],ymm12[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd %ymm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 224(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%rcx) -; AVX1-ONLY-NEXT: addq $232, %rsp +; AVX1-ONLY-NEXT: addq $168, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $232, %rsp -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, 
%ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm10, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rsi) @@ -1119,48 +1110,56 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) -; AVX2-ONLY-NEXT: addq $232, %rsp -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: load_i64_stride3_vf32: +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rcx) +; AVX2-ONLY-NEXT: addq $232, %rsp +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: load_i64_stride3_vf32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512-NEXT: 
vmovdqa64 (%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,3,6,9,12,15,u,u> ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm15 ; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm16 @@ -1168,26 +1167,26 @@ ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 -; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 +; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm20 ; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm2 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 +; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm2 -; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 ; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) @@ -1195,13 +1194,13 @@ ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <96 x i64>, ptr %in.vec, align 64 @@ -1705,280 +1704,245 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vmovapd %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm12[1],ymm3[0],ymm12[3],ymm3[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm13[1],ymm7[0],ymm13[3],ymm7[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm8[1],ymm6[0],ymm8[3],ymm6[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm15[1],ymm5[0],ymm15[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[1],ymm1[0],ymm11[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm5[0],ymm8[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2] -; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm3[0],ymm10[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[1],ymm2[0],ymm14[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovapd 224(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[3],ymm13[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm12[0],ymm0[3],ymm12[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] -; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[1],ymm0[0],ymm4[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm4[0],ymm11[3],ymm4[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm11 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm2[0],ymm6[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm1[0],ymm3[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd $5, (%rsp), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm0[1],mem[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm2[0],ymm6[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[1],ymm2[0],ymm9[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm1[1],ymm6[2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm1[1],ymm13[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0],ymm1[1],ymm11[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm1[1],ymm8[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0],ymm1[1],ymm15[2],ymm1[3] +; 
AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0],ymm1[1],ymm10[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0],ymm9[1],ymm4[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm4[1],mem[2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0],ymm11[1],mem[2],ymm11[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm11[0],ymm13[1],ymm11[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0],ymm0[1],ymm14[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm0[1],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0],ymm4[1],ymm0[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm11[1],ymm14[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovapd %ymm7, 448(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 320(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 416(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX1-ONLY-NEXT: 
vmovaps 1376(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd %ymm9, 480(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 416(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 352(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 288(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1995,305 +1959,321 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 192(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 256(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 480(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 448(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 416(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 384(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 352(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 320(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; 
AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX2-ONLY-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm14[0,1],mem[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 
400(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2301,43 +2281,27 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 320(%rsi) 
-; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm15, 416(%rsi) @@ -2353,55 +2317,71 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm15, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rdx) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 
32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-ONLY-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -2410,67 +2390,67 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm18 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 576(%rdi), 
%zmm28 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,3,6,9,12,15,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm6 ; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm15 ; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm16 ; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm27, %zmm17 ; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm18 -; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm26, %zmm19 +; AVX512-NEXT: vpermt2q %zmm12, %zmm11, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm21 +; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm24 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,4,7,10,13,u,u,u> +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22 +; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm22 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 -; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30 -; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm29 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm29 -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm29 +; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 +; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm30 +; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm12 +; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm28 +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm27 -; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm26 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm26 -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm22 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm13 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19 -; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm11 -; AVX512-NEXT: vpermi2q %zmm19, %zmm25, %zmm21 -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm19 +; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm27 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512-NEXT: vpermt2q %zmm13, %zmm20, %zmm26 +; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm18, 
%zmm23 +; AVX512-NEXT: vpermt2q %zmm14, %zmm20, %zmm23 +; AVX512-NEXT: vpermt2q %zmm18, %zmm31, %zmm14 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512-NEXT: vpermi2q %zmm18, %zmm25, %zmm11 +; AVX512-NEXT: vpermi2q %zmm18, %zmm25, %zmm20 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm18 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 @@ -2478,54 +2458,54 @@ ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm15 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm16 ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm19 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm21 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm22 ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm29 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm28 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm21 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm27 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm20 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] ; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm8 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm9 ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 ; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm13 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm20, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rsi) +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 320(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm26, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm30, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm24, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm29, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 
64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm13, 384(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm12, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i64>, ptr %in.vec, align 64 @@ -2544,6 +2524,8 @@ ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} +; AVX512-FAST: {{.*}} +; AVX512-SLOW: {{.*}} ; AVX512BW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -50,56 +50,22 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r8) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: load_i64_stride4_vf2: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rdx) -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, (%r8) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-SLOW-LABEL: load_i64_stride4_vf2: -; AVX512-SLOW: # %bb.0: -; AVX512-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX512-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX512-SLOW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512-SLOW-NEXT: vmovaps (%rdi), %ymm3 -; AVX512-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512-SLOW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512-SLOW-NEXT: vmovaps %xmm2, (%rsi) -; AVX512-SLOW-NEXT: vmovaps %xmm0, (%rdx) -; AVX512-SLOW-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, (%r8) -; AVX512-SLOW-NEXT: vzeroupper -; AVX512-SLOW-NEXT: retq -; -; AVX512-FAST-LABEL: load_i64_stride4_vf2: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] -; AVX512-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 -; AVX512-FAST-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-FAST-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX512-FAST-NEXT: vmovaps (%rdi), %ymm2 -; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512-FAST-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512-FAST-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512-FAST-NEXT: vmovaps %xmm0, (%rsi) -; AVX512-FAST-NEXT: vmovaps %xmm1, (%rdx) -; AVX512-FAST-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX512-FAST-NEXT: vextractf128 $1, %ymm2, (%r8) -; AVX512-FAST-NEXT: vzeroupper -; AVX512-FAST-NEXT: retq +; AVX2-LABEL: load_i64_stride4_vf2: +; AVX2: # 
%bb.0: +; AVX2-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-NEXT: vmovaps %xmm2, (%rsi) +; AVX2-NEXT: vmovaps %xmm0, (%rdx) +; AVX2-NEXT: vextractf128 $1, %ymm4, (%rcx) +; AVX2-NEXT: vextractf128 $1, %ymm1, (%r8) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %wide.vec = load <8 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <2 x i32> %strided.vec1 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <2 x i32> @@ -230,20 +196,20 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm13 ; SSE-NEXT: movaps 224(%rdi), %xmm14 -; SSE-NEXT: movaps 192(%rdi), %xmm7 +; SSE-NEXT: movaps 192(%rdi), %xmm6 ; SSE-NEXT: movaps 160(%rdi), %xmm15 -; SSE-NEXT: movaps 128(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm7 ; SSE-NEXT: movaps 96(%rdi), %xmm12 ; SSE-NEXT: movaps 64(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm12 +; SSE-NEXT: movaps %xmm7, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] -; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] +; SSE-NEXT: movaps %xmm6, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1] ; SSE-NEXT: movaps %xmm4, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] @@ -262,13 +228,13 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps %xmm14, (%rsi) ; SSE-NEXT: movaps %xmm12, 32(%rsi) ; SSE-NEXT: movaps %xmm10, 16(%rsi) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm6, 32(%rdx) +; SSE-NEXT: movaps %xmm14, (%rsi) +; SSE-NEXT: movaps %xmm6, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: movaps %xmm9, 48(%rcx) ; SSE-NEXT: movaps %xmm11, 32(%rcx) ; SSE-NEXT: movaps %xmm13, 16(%rcx) @@ -436,35 +402,35 @@ ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm8 -; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps 480(%rdi), %xmm2 -; SSE-NEXT: movaps 448(%rdi), %xmm10 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm4 -; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm8 +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm9 +; SSE-NEXT: movaps 416(%rdi), %xmm2 +; SSE-NEXT: movaps 384(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps 128(%rdi), %xmm11 +; SSE-NEXT: movaps 352(%rdi), %xmm4 ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 
64(%rdi), %xmm14 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 256(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movaps %xmm13, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill @@ -508,15 +474,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 272(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movaps 272(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 336(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: movaps 336(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 ; SSE-NEXT: movaps 400(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm6 @@ -524,59 +490,59 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps 464(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps 
%xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm3, 112(%rcx) ; SSE-NEXT: movaps %xmm6, 96(%rcx) -; SSE-NEXT: movaps %xmm14, 32(%rcx) -; SSE-NEXT: movaps %xmm4, 112(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps %xmm10, 64(%rcx) -; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps %xmm9, 80(%rcx) +; SSE-NEXT: movaps %xmm10, 64(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm14, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm2, 112(%r8) ; SSE-NEXT: movaps %xmm5, 96(%r8) -; SSE-NEXT: movaps %xmm8, 80(%r8) -; SSE-NEXT: movaps %xmm7, 64(%r8) +; SSE-NEXT: movaps %xmm7, 80(%r8) +; SSE-NEXT: movaps %xmm8, 64(%r8) ; SSE-NEXT: movaps %xmm12, 48(%r8) ; SSE-NEXT: movaps %xmm11, 32(%r8) ; SSE-NEXT: movaps %xmm15, 16(%r8) @@ -587,144 +553,144 @@ ; AVX1-ONLY-LABEL: load_i64_stride4_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 
176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm13[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm15, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) ; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -732,41 +698,41 @@ ; AVX2-ONLY-LABEL: load_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm7, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm2, %ymm10 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm7, %ymm11 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm7, %ymm12 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm8, %ymm13 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm14, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -775,74 +741,74 @@ ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd 
{{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm13[1],ymm10[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) ; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -924,26 +890,26 @@ ; SSE-LABEL: load_i64_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdi), %xmm3 -; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 480(%rdi), %xmm1 -; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps 
288(%rdi), %xmm4 -; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 608(%rdi), %xmm2 -; SSE-NEXT: movaps 352(%rdi), %xmm6 -; SSE-NEXT: movaps 320(%rdi), %xmm14 +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 416(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm11 +; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps 128(%rdi), %xmm10 +; SSE-NEXT: movaps 352(%rdi), %xmm4 +; SSE-NEXT: movaps 320(%rdi), %xmm13 ; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 64(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps 544(%rdi), %xmm2 +; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 256(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -959,11 +925,11 @@ ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -974,21 +940,14 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm0 +; SSE-NEXT: movaps 512(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps 512(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 704(%rdi), %xmm1 +; SSE-NEXT: movaps 608(%rdi), %xmm0 +; SSE-NEXT: movaps 576(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1001,8 +960,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps 736(%rdi), %xmm0 +; SSE-NEXT: movaps 704(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1015,8 +974,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm0 -; SSE-NEXT: movaps 960(%rdi), %xmm1 +; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1029,6 +988,13 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps 960(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1134,14 +1100,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -1150,13 +1108,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1166,37 +1124,45 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: 
movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm1, 240(%rcx) ; SSE-NEXT: movaps %xmm5, 224(%rcx) ; SSE-NEXT: movaps %xmm8, 208(%rcx) @@ -1251,528 +1217,512 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1112, %rsp # imm = 0x458 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = 
xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; 
AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; 
AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = 
xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: addq $1112, %rsp # imm = 0x458 +; AVX1-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), 
%ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: 
vmovaps 672(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm15 
-; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: 
vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] -; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) @@ -1781,13 +1731,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1797,13 +1747,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1812,17 +1762,23 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX2-ONLY-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1835,72 +1791,72 @@ ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: 
vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-NEXT: vpermt2q %zmm18, %zmm19, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm16 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm16[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512-NEXT: vpermt2q %zmm18, %zmm20, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2q %zmm14, %zmm20, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm24 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vpermt2q %zmm18, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 
+; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] @@ -1910,34 +1866,34 @@ ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 +; AVX512-NEXT: vpermt2q %zmm18, %zmm28, %zmm17 ; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm9, %zmm28, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 +; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm9, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -2636,386 +2592,394 @@ ; AVX1-ONLY-LABEL: load_i64_stride4_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2728, %rsp # imm = 0xAA8 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; 
AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm3[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 
%xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm2 ; AVX1-ONLY-NEXT: 
vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; 
AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -3029,7 +2993,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps 
(%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3038,7 +3002,8 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -3060,235 +3025,228 @@ ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte 
Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rdx) -; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte 
Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps 
%xmm5, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; 
AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) ; AVX1-ONLY-NEXT: addq $2728, %rsp # imm = 0xAA8 ; AVX1-ONLY-NEXT: vzeroupper @@ -3297,134 +3255,146 @@ ; AVX2-ONLY-LABEL: load_i64_stride4_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3016, %rsp # imm = 0xBC8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1376(%rdi), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 
1408(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1344(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1760(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1632(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1600(%rdi), %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1856(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; 
AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1376(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1344(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1632(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1600(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 
1760(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1856(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 @@ -3437,294 +3407,270 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 
{{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 
1792(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm11[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm8[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = 
ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) @@ -3741,21 +3687,21 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ 
-3773,53 +3719,73 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%r8) -; 
AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 256(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) @@ -3834,68 +3800,68 @@ ; AVX512-LABEL: load_i64_stride4_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm24 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm27 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm29 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: 
vpermt2q %zmm24, %zmm9, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm30, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2q %zmm28, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm27, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm26, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm25, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [1,5,9,13,1,5,9,13] ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512-NEXT: vpermt2q %zmm30, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2q %zmm28, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm27, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm7 -; 
AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512-NEXT: vpermt2q %zmm29, %zmm22, %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm6 @@ -3905,26 +3871,26 @@ ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512-NEXT: vpermt2q %zmm28, %zmm22, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 +; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm27, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm24, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -3933,10 +3899,10 @@ ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm24, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm24 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 @@ -3945,10 +3911,10 @@ ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm24 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-NEXT: 
vmovdqa64 1152(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm30 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -3956,71 +3922,75 @@ ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm28 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm29 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29 -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm23 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm25 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm25 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm20 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm17 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm19 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm13, %zmm21 ; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm13 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm18 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1536(%rdi), 
%zmm3 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm12 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm14 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 +; AVX512-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vpermt2q %zmm0, %zmm22, %zmm9 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -4029,100 +3999,96 @@ ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload ; AVX512-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm18[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm20[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,2,3],zmm4[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; 
AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 # 64-byte Folded Reload ; AVX512-NEXT: # zmm20 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload +; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload ; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm23[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm5[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload ; AVX512-NEXT: # zmm15 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload -; AVX512-NEXT: # zmm17 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload -; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512-NEXT: # zmm7 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm22 # 64-byte 
Folded Reload -; AVX512-NEXT: # zmm22 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm27 # 64-byte Folded Reload -; AVX512-NEXT: # zmm27 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512-NEXT: # zmm7 = zmm7[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm14[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm22[0,1,2,3],zmm9[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload +; AVX512-NEXT: # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm22 # 64-byte Folded Reload +; AVX512-NEXT: # zmm22 = zmm22[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 64-byte Folded Reload +; AVX512-NEXT: # zmm27 = zmm27[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm24 # 64-byte Folded Reload ; AVX512-NEXT: # zmm24 = zmm24[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm28[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm25[0,1,2,3],zmm28[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm6, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm26, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm31, 256(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 192(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 128(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm10, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm30, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps 
%zmm2, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm7, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm15, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm24, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm12, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm25, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm24, 256(%r8) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8) ; AVX512-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -4140,10 +4106,11 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX1: {{.*}} -; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} +; AVX512-FAST: {{.*}} +; AVX512-SLOW: {{.*}} ; AVX512BW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -39,20 +39,20 @@ ; AVX1-ONLY-LABEL: load_i64_stride5_vf2: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r8) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) +; AVX1-ONLY-NEXT: vmovaps 
%xmm2, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%r8) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf2: @@ -80,17 +80,17 @@ ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512-NEXT: vmovaps (%rdi), %ymm2 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) ; AVX512-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-NEXT: vmovaps %xmm4, (%r8) ; AVX512-NEXT: vmovdqa %xmm1, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -152,33 +152,32 @@ ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[3],ymm9[2] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm4[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[3],ymm7[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: 
vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],xmm8[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm5[0],xmm7[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -259,23 +258,23 @@ ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u> ; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] -; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512-FAST-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 -; AVX512-FAST-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> -; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u> +; AVX512-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,7,12,u> +; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <11,0,5,u> +; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm6 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <12,1,6,u> ; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 -; AVX512-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm7 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,1,6,u> -; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512-FAST-NEXT: vmovdqa %ymm4, (%rdx) ; AVX512-FAST-NEXT: vmovdqa %ymm3, (%rcx) -; AVX512-FAST-NEXT: vmovdqa %ymm6, (%r8) +; AVX512-FAST-NEXT: vmovdqa %ymm5, (%r8) ; AVX512-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq @@ -296,153 +295,152 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i64_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movapd 304(%rdi), %xmm2 -; SSE-NEXT: movapd 64(%rdi), %xmm1 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 256(%rdi), %xmm4 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 288(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm6 +; SSE-NEXT: movapd 224(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm1 +; SSE-NEXT: movapd 64(%rdi), %xmm0 +; SSE-NEXT: movapd 176(%rdi), %xmm4 +; SSE-NEXT: movapd 96(%rdi), %xmm3 +; SSE-NEXT: movapd 208(%rdi), %xmm7 +; SSE-NEXT: movapd 128(%rdi), %xmm6 ; SSE-NEXT: movapd (%rdi), %xmm9 ; SSE-NEXT: movapd 16(%rdi), %xmm5 ; SSE-NEXT: movapd 32(%rdi), %xmm14 ; SSE-NEXT: movapd 48(%rdi), %xmm8 -; SSE-NEXT: movapd 240(%rdi), %xmm11 -; SSE-NEXT: movapd 272(%rdi), %xmm13 ; SSE-NEXT: movapd 
160(%rdi), %xmm10 -; SSE-NEXT: movapd 192(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm9[0],xmm15[1] +; SSE-NEXT: movapd 192(%rdi), %xmm13 +; SSE-NEXT: movapd 80(%rdi), %xmm11 +; SSE-NEXT: movapd 112(%rdi), %xmm15 +; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm8[0] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm13, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 80(%rdi), %xmm13 -; SSE-NEXT: movapd 112(%rdi), %xmm4 +; SSE-NEXT: movapd 240(%rdi), %xmm13 +; SSE-NEXT: movapd 272(%rdi), %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; SSE-NEXT: movapd 128(%rdi), %xmm0 +; SSE-NEXT: movapd 288(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] -; SSE-NEXT: movapd 96(%rdi), %xmm1 +; SSE-NEXT: movapd 256(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 144(%rdi), %xmm3 +; SSE-NEXT: movapd 304(%rdi), %xmm3 ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] -; SSE-NEXT: movapd %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm14, 48(%rsi) -; SSE-NEXT: movapd %xmm15, (%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rsi) -; SSE-NEXT: movapd %xmm13, 16(%rdx) -; SSE-NEXT: movapd %xmm11, 48(%rdx) -; SSE-NEXT: movapd %xmm9, (%rdx) +; SSE-NEXT: movapd %xmm2, 48(%rsi) +; SSE-NEXT: movapd %xmm15, 32(%rsi) +; SSE-NEXT: movapd %xmm14, 16(%rsi) +; SSE-NEXT: movapd %xmm12, (%rsi) +; SSE-NEXT: movapd %xmm13, 48(%rdx) ; SSE-NEXT: movapd %xmm10, 32(%rdx) -; SSE-NEXT: movapd %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm7, 48(%rcx) +; SSE-NEXT: movapd %xmm11, 16(%rdx) +; SSE-NEXT: movapd %xmm9, (%rdx) +; SSE-NEXT: movapd %xmm0, 48(%rcx) +; SSE-NEXT: movapd %xmm7, 32(%rcx) +; SSE-NEXT: movapd %xmm6, 16(%rcx) 
; SSE-NEXT: movapd %xmm8, (%rcx) -; SSE-NEXT: movapd %xmm6, 32(%rcx) -; SSE-NEXT: movapd %xmm1, 16(%r8) +; SSE-NEXT: movapd %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movapd %xmm5, (%r8) +; SSE-NEXT: movapd %xmm3, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movapd %xmm3, 16(%r9) +; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm10[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm10[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm12[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm12[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[3],ymm13[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 
208(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm2[0],ymm13[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],xmm14[1] +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm6[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm8, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm13, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -655,8 +653,8 @@ ; SSE-NEXT: movapd 224(%rdi), %xmm3 ; SSE-NEXT: movapd 144(%rdi), %xmm2 ; SSE-NEXT: movapd 64(%rdi), %xmm1 -; SSE-NEXT: movapd 176(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rdi), %xmm5 +; SSE-NEXT: movapd 176(%rdi), %xmm5 +; SSE-NEXT: movapd 96(%rdi), %xmm4 ; SSE-NEXT: movapd 208(%rdi), %xmm7 ; SSE-NEXT: movapd 128(%rdi), %xmm8 ; SSE-NEXT: movapd (%rdi), %xmm10 @@ -683,10 +681,10 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm4[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm2[0] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm13, %xmm0 @@ -694,10 +692,10 @@ ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 @@ -710,19 +708,20 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 256(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 304(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 320(%rdi), %xmm15 +; SSE-NEXT: movapd 
320(%rdi), %xmm2 ; SSE-NEXT: movapd 352(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 368(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 336(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -730,79 +729,79 @@ ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 400(%rdi), %xmm11 +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd 400(%rdi), %xmm10 ; SSE-NEXT: movapd 432(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm11[0],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] ; SSE-NEXT: movapd 448(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm12[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm12[0] ; SSE-NEXT: movapd 416(%rdi), %xmm14 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm14[0],xmm12[1] -; SSE-NEXT: movapd 464(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 480(%rdi), %xmm4 -; SSE-NEXT: movapd 512(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd 528(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm7[0] +; SSE-NEXT: movapd 464(%rdi), %xmm15 +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm15[0] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movapd 480(%rdi), %xmm5 +; SSE-NEXT: movapd 512(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movapd 528(%rdi), %xmm8 +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] ; SSE-NEXT: movapd 496(%rdi), %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1] -; SSE-NEXT: movapd 544(%rdi), %xmm10 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movapd 544(%rdi), %xmm11 +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: movapd 560(%rdi), %xmm0 -; SSE-NEXT: movapd 592(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd 608(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd 592(%rdi), %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 608(%rdi), %xmm2 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm2[0] ; SSE-NEXT: movapd 576(%rdi), %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd 624(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm8[0] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd 624(%rdi), %xmm7 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = 
xmm4[0],xmm7[1] +; SSE-NEXT: movapd %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movapd %xmm6, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movapd %xmm2, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movapd %xmm13, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movapd %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movapd %xmm15, 64(%rdx) +; SSE-NEXT: movapd %xmm5, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movapd %xmm11, 80(%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movapd %xmm10, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm7, 96(%rcx) -; SSE-NEXT: movapd %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm2, 112(%rcx) +; SSE-NEXT: movapd %xmm8, 96(%rcx) ; SSE-NEXT: movapd %xmm12, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm3, 112(%r8) ; SSE-NEXT: movapd %xmm9, 96(%r8) ; SSE-NEXT: movapd %xmm14, 80(%r8) @@ -816,11 +815,10 @@ ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm8, 112(%r9) -; SSE-NEXT: movapd %xmm10, 96(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movapd %xmm7, 112(%r9) +; SSE-NEXT: movapd %xmm11, 96(%r9) +; SSE-NEXT: movapd %xmm15, 80(%r9) +; 
SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) @@ -835,343 +833,347 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm11[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm4[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm11[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm6[1] -; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm5[0],ymm14[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm14[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm9[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = 
ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm14[0],ymm15[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[0],ymm15[0],ymm11[3],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = 
ymm13[0,1],ymm11[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm12[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm8, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r9) +; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = 
xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr 
{{.*#+}} xmm5 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] -; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm12[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 
= ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 96(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1391,17 +1393,17 @@ ; SSE-NEXT: movapd 64(%rdi), %xmm3 ; SSE-NEXT: movapd 176(%rdi), %xmm7 ; SSE-NEXT: movapd 96(%rdi), %xmm6 -; SSE-NEXT: movapd 208(%rdi), %xmm9 -; SSE-NEXT: movapd 128(%rdi), %xmm10 +; SSE-NEXT: movapd 208(%rdi), %xmm10 +; SSE-NEXT: movapd 128(%rdi), %xmm9 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm8 -; SSE-NEXT: movapd 32(%rdi), %xmm1 +; SSE-NEXT: movapd 32(%rdi), %xmm0 ; SSE-NEXT: movapd 48(%rdi), %xmm11 ; SSE-NEXT: movapd 160(%rdi), %xmm13 -; SSE-NEXT: movapd 192(%rdi), %xmm0 +; SSE-NEXT: movapd 192(%rdi), %xmm1 ; SSE-NEXT: movapd 80(%rdi), %xmm14 ; SSE-NEXT: movapd 112(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: movapd %xmm0, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] @@ -1410,29 +1412,29 @@ ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm3[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm9[0] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm10[0] ; 
SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm7[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 ; SSE-NEXT: movapd 272(%rdi), %xmm0 @@ -1564,9 +1566,8 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 880(%rdi), %xmm2 ; SSE-NEXT: movapd 912(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] ; SSE-NEXT: movapd 928(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1578,44 +1579,45 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 960(%rdi), %xmm10 +; SSE-NEXT: movapd 960(%rdi), %xmm12 ; SSE-NEXT: movapd 992(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] -; SSE-NEXT: movapd 1008(%rdi), %xmm15 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm15[0] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1] +; SSE-NEXT: movapd 1008(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movapd 976(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1024(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1040(%rdi), %xmm8 ; SSE-NEXT: movapd 1072(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm9 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: movapd 1088(%rdi), %xmm11 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm11[0] +; SSE-NEXT: movapd 1088(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm10[0] ; SSE-NEXT: movapd 1056(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] ; SSE-NEXT: movapd 1104(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1120(%rdi), %xmm5 -; SSE-NEXT: movapd 1152(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: movapd 1152(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] ; SSE-NEXT: movapd 1168(%rdi), %xmm6 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] -; SSE-NEXT: movapd 1136(%rdi), 
%xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm12[0],xmm6[1] -; SSE-NEXT: movapd 1184(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1136(%rdi), %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm11[0],xmm6[1] +; SSE-NEXT: movapd 1184(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1200(%rdi), %xmm0 ; SSE-NEXT: movapd 1232(%rdi), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm1 @@ -1627,27 +1629,19 @@ ; SSE-NEXT: movapd 1264(%rdi), %xmm13 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm13[0] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] -; SSE-NEXT: movapd %xmm3, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movapd %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) +; SSE-NEXT: movapd %xmm15, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movapd %xmm14, 192(%rsi) +; SSE-NEXT: movapd %xmm3, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps %xmm1, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movapd %xmm9, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 144(%rsi) @@ -1655,38 +1649,46 @@ ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movapd %xmm5, 224(%rdx) +; SSE-NEXT: movapd %xmm14, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movapd %xmm0, 240(%rdx) -; SSE-NEXT: movapd %xmm10, 192(%rdx) +; SSE-NEXT: movapd %xmm5, 224(%rdx) ; SSE-NEXT: movapd %xmm8, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movapd %xmm12, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movapd %xmm4, 240(%rcx) ; SSE-NEXT: movapd %xmm6, 224(%rcx) -; SSE-NEXT: movapd %xmm11, 208(%rcx) -; SSE-NEXT: movapd %xmm15, 192(%rcx) +; SSE-NEXT: movapd %xmm10, 208(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1712,10 +1714,10 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 240(%r8) -; SSE-NEXT: movapd %xmm12, 224(%r8) +; SSE-NEXT: movapd %xmm11, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r8) @@ -1744,7 +1746,7 @@ ; SSE-NEXT: movapd %xmm13, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -1777,609 +1779,586 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1368, %rsp # imm = 0x558 -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $1336, %rsp # imm = 0x538 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 
64(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 960(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 +; 
AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm1[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 
208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm14[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: 
vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), 
%xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[3],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm15[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm9[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 
1024(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm13[0],ymm3[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm14[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3],xmm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm12[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vpblendw $15, (%rsp), %xmm3, %xmm3 # 
16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm15[4,5,6,7] 
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps 
%ymm4, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 224(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 +; AVX1-ONLY-NEXT: vmovapd %ymm3, 160(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r9) +; AVX1-ONLY-NEXT: addq $1336, %rsp # imm = 0x538 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm0 ; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm14 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr 
{{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; 
AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -2387,233 +2366,233 @@ ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = 
ymm13[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r9) -; AVX2-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, (%r9) +; AVX2-ONLY-NEXT: addq $1480, %rsp # imm = 0x5C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), 
%zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,5,10,15] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm24, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm19, %zmm1, %zmm24 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm19, %zmm13 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, 
%zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm19, %zmm23 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 @@ -2624,133 +2603,133 @@ ; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <11,0,5,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm19, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm22, %zmm19 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm15 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; 
AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 ; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm16, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm25 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm11 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm20[0,1,2,3],zmm24[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm24, %zmm12 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm24, %zmm9 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 -; 
AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm19 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm14 ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, 
192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, (%r9) ; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2758,60 +2737,60 @@ ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm1, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: 
vpermt2q %zmm22, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm19, %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm23 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm19, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 @@ -2822,133 +2801,133 @@ ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <11,0,5,u> +; AVX512BW-NEXT: vpermi2q 
%zmm1, %zmm19, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <11,0,5,u> -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm19 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm14 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm25 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm24[0,1,2,3],zmm26[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm25 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = 
zmm11[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm11 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm20[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm24, %zmm12 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm9 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm14 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm15 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rdx) +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm30, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r9) ; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3779,76 +3758,173 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3256, %rsp # imm = 0xCB8 -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), 
%ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: subq $3192, %rsp # imm = 0xC78 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: 
vmovaps 1344(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm8 +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm7[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1856(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 1792(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2080(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 2080(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 @@ -3856,554 +3932,383 @@ ; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vinsertf128 $1, 2048(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2368(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1568(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), 
%xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2528(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: 
vmovaps 2096(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: 
vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1376(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1776(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 2096(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: 
vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1936(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps $12, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; 
AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1536(%rdi), %ymm0, %ymm0 
; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] ; AVX1-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1888(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1856(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2048(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] ; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] ; AVX1-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2336(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] ; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2528(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2496(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] @@ -4507,22 +4412,6 @@ ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) @@ -4539,21 +4428,21 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps 
%ymm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4571,38 +4460,54 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -4658,294 +4563,294 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $3256, %rsp # imm = 0xCB8 +; AVX1-ONLY-NEXT: addq $3192, %rsp # imm = 0xC78 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3240, %rsp # imm = 0xCA8 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $3288, %rsp # imm = 0xCD8 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 
= xmm3[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), 
%ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte 
Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 @@ -4953,9 +4858,9 @@ ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm7 @@ -4963,107 +4868,109 @@ ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; 
AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -5075,158 +4982,160 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; 
AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] @@ -5246,1177 +5155,1181 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 
(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 480(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 448(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 416(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 384(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 320(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 288(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 416(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 384(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 352(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 288(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 128(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $3240, %rsp # imm = 0xCA8 +; AVX2-ONLY-NEXT: addq $3288, %rsp # imm = 0xCD8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512F-NEXT: subq $3464, %rsp # imm = 0xD88 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm26 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 -; AVX512F-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm17 +; 
AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm18, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,12,0,2,7,12,0,2] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,0,11,0,5,0,11] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm18, %zmm22 ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: 
vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <1,6,11,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <2,7,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: 
vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm26 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm23 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm27 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm20 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 
+; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm6 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm9, %zmm0, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm2 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm16[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm17[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512F-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm19 = 
zmm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm11 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm12[0,1,2,3],zmm22[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm13[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm18 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm12 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm22 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 
%zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; 
AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 
%zmm1, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 {%k1} +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 {%k1} +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm23 {%k1} +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 {%k1} +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 
192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 448(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm9 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 384(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 %zmm19, (%r9) +; AVX512F-NEXT: addq $3464, %rsp # imm = 0xD88 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-NEXT: subq $3464, %rsp # imm = 0xD88 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm8 +; AVX512BW-NEXT: vmovdqu64 
%zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,12,0,2,7,12,0,2] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,0,11,0,5,0,11] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm18 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; 
AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u> +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <1,6,11,u> ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <2,7,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm31 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm26 -; 
AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm26 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm20 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm23 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 
%zmm29 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm28 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm12 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 = zmm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm17[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm5, %zmm5 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm5 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm17[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm19 = zmm11[0,1,2,3],mem[4,5,6,7] -; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm3 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm6 = zmm6[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm12[0,1,2,3],zmm22[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm13[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm25, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, 
%zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; 
AVX512BW-NEXT: vmovaps %zmm1, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512BW-NEXT: vmovaps %zmm0, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 384(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r9) +; AVX512BW-NEXT: addq $3464, %rsp # imm = 0xD88 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <320 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -42,28 +42,51 @@ ; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: retq ; -; AVX-LABEL: load_i64_stride6_vf2: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm4, (%rsi) -; AVX-NEXT: vmovaps %xmm0, (%rdx) -; AVX-NEXT: vmovaps %xmm5, (%rcx) -; AVX-NEXT: vmovaps %xmm1, (%r8) -; AVX-NEXT: vmovaps 
%xmm6, (%r9) -; AVX-NEXT: vmovaps %xmm2, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: load_i64_stride6_vf2: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX1-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-NEXT: vmovaps %xmm4, (%rsi) +; AVX1-NEXT: vmovaps %xmm0, (%rdx) +; AVX1-NEXT: vmovaps %xmm5, (%rcx) +; AVX1-NEXT: vmovaps %xmm1, (%r8) +; AVX1-NEXT: vmovaps %xmm6, (%r9) +; AVX1-NEXT: vmovaps %xmm2, (%rax) +; AVX1-NEXT: retq +; +; AVX512-LABEL: load_i64_stride6_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX512-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX512-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] +; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vpbroadcastq 24(%rdi), %xmm3 +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm3, %xmm3 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX512-NEXT: vpunpcklqdq 88(%rdi){1to2}, %xmm5, %xmm5 +; AVX512-NEXT: vmovaps %xmm4, (%rsi) +; AVX512-NEXT: vmovaps %xmm0, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) +; AVX512-NEXT: vmovdqa %xmm3, (%r8) +; AVX512-NEXT: vmovaps %xmm2, (%r9) +; AVX512-NEXT: vmovdqa %xmm5, (%rax) +; AVX512-NEXT: retq %wide.vec = load <12 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <12 x i64> %wide.vec, <12 x i64> poison, <2 x i32> %strided.vec1 = shufflevector <12 x i64> %wide.vec, <12 x i64> poison, <2 x i32> @@ -84,48 +107,48 @@ ; SSE-LABEL: load_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm6 ; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm9 ; SSE-NEXT: movaps 144(%rdi), %xmm10 ; SSE-NEXT: movaps 96(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm6 -; 
SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, 16(%rsi) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps %xmm11, 16(%rdx) -; SSE-NEXT: movaps %xmm10, (%rcx) -; SSE-NEXT: movaps %xmm9, 16(%rcx) -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps %xmm10, 16(%rcx) +; SSE-NEXT: movaps %xmm9, (%rcx) +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm3, (%r8) +; SSE-NEXT: movaps %xmm8, 16(%r9) ; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: movaps %xmm6, 16(%r9) -; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride6_vf4: @@ -273,20 +296,21 @@ ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u> ; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] -; AVX512-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512-FAST-NEXT: vmovdqa 160(%rdi), %xmm7 ; AVX512-FAST-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u> -; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; AVX512-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 +; AVX512-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 +; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <11,1,7,u> +; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,0,6] +; AVX512-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512-FAST-NEXT: vpermi2q %ymm8, %ymm4, %ymm7 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] ; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm7 +; AVX512-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -318,21 +342,21 @@ ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp 
-; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 352(%rdi), %xmm10 ; SSE-NEXT: movaps 256(%rdi), %xmm13 ; SSE-NEXT: movaps 208(%rdi), %xmm5 -; SSE-NEXT: movaps 352(%rdi), %xmm15 -; SSE-NEXT: movaps 304(%rdi), %xmm6 +; SSE-NEXT: movaps 160(%rdi), %xmm15 +; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 64(%rdi), %xmm0 ; SSE-NEXT: movaps (%rdi), %xmm8 ; SSE-NEXT: movaps 16(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rdi), %xmm1 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 336(%rdi), %xmm2 +; SSE-NEXT: movaps 288(%rdi), %xmm11 ; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps 336(%rdi), %xmm4 -; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] ; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill @@ -363,7 +387,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps 304(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] @@ -372,8 +396,8 @@ ; SSE-NEXT: movaps %xmm10, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] @@ -382,44 +406,44 @@ ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm11, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps %xmm12, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) +; SSE-NEXT: movaps %xmm9, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm7, 16(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) ; SSE-NEXT: movaps %xmm15, 32(%rcx) -; SSE-NEXT: movaps %xmm14, 48(%rcx) +; SSE-NEXT: movaps %xmm14, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm13, 16(%r8) +; SSE-NEXT: movaps %xmm13, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) 
+; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: movaps %xmm2, 48(%r9) ; SSE-NEXT: movaps %xmm4, 32(%r9) -; SSE-NEXT: movaps %xmm6, 48(%r9) +; SSE-NEXT: movaps %xmm6, 16(%r9) ; SSE-NEXT: movaps %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm5, 48(%rax) +; SSE-NEXT: movaps %xmm5, 16(%rax) ; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq @@ -785,35 +809,35 @@ ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 576(%rdi), %xmm8 -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm9 -; SSE-NEXT: movaps 720(%rdi), %xmm2 -; SSE-NEXT: movaps 672(%rdi), %xmm10 -; SSE-NEXT: movaps 336(%rdi), %xmm3 -; SSE-NEXT: movaps 288(%rdi), %xmm11 -; SSE-NEXT: movaps 432(%rdi), %xmm4 -; SSE-NEXT: movaps 384(%rdi), %xmm13 -; SSE-NEXT: movaps 528(%rdi), %xmm5 +; SSE-NEXT: movaps 720(%rdi), %xmm0 +; SSE-NEXT: movaps 672(%rdi), %xmm8 +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps 624(%rdi), %xmm2 +; SSE-NEXT: movaps 576(%rdi), %xmm10 +; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm11 +; SSE-NEXT: movaps 528(%rdi), %xmm4 ; SSE-NEXT: movaps 480(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps 432(%rdi), %xmm6 +; SSE-NEXT: movaps 384(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movaps %xmm13, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -902,16 +926,16 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 128(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = 
xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 272(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 320(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm10 @@ -938,53 +962,53 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps %xmm11, 112(%r8) ; SSE-NEXT: movaps %xmm15, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1015,8 +1039,8 @@ ; SSE-NEXT: movaps %xmm6, 80(%rax) ; SSE-NEXT: movaps %xmm5, 64(%rax) ; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: movaps %xmm12, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm13, 32(%rax) +; SSE-NEXT: movaps %xmm12, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 @@ -1074,326 +1098,323 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 +; 
AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm5[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm14[1],ymm9[3],ymm14[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm10[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = 
xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm15[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm15[1],xmm9[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 
96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) ; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm4[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm13[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm11[0],xmm3[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm11[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm15[1],ymm5[3],ymm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] ; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm3[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm5[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm14[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpckhpd 
{{.*#+}} xmm0 = xmm12[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm12[1],ymm8[3],ymm12[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm15[1],ymm8[1],ymm15[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm7[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm4[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; 
AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm7[0],xmm5[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm14[1],ymm6[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1418,25 +1439,25 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1745,26 +1766,26 @@ ; SSE-LABEL: load_i64_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 576(%rdi), %xmm9 -; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 720(%rdi), %xmm1 -; SSE-NEXT: movaps 672(%rdi), %xmm11 -; SSE-NEXT: movaps 336(%rdi), %xmm5 -; SSE-NEXT: movaps 288(%rdi), %xmm10 -; SSE-NEXT: movaps 432(%rdi), %xmm4 -; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 912(%rdi), %xmm2 -; SSE-NEXT: movaps 528(%rdi), %xmm6 -; SSE-NEXT: movaps 480(%rdi), %xmm14 +; SSE-NEXT: movaps 720(%rdi), %xmm0 +; SSE-NEXT: movaps 672(%rdi), %xmm9 +; SSE-NEXT: movaps 336(%rdi), %xmm3 +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps 624(%rdi), %xmm1 +; SSE-NEXT: movaps 576(%rdi), %xmm11 +; SSE-NEXT: movaps 240(%rdi), %xmm5 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps 528(%rdi), %xmm4 +; SSE-NEXT: movaps 480(%rdi), %xmm13 ; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; 
SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps 816(%rdi), %xmm2 +; SSE-NEXT: movaps 432(%rdi), %xmm6 +; SSE-NEXT: movaps 384(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1780,11 +1801,11 @@ ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1795,21 +1816,14 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps 768(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm0 -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1104(%rdi), %xmm0 -; SSE-NEXT: movaps 1056(%rdi), %xmm1 +; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps 864(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1822,8 +1836,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1296(%rdi), %xmm0 -; SSE-NEXT: movaps 1248(%rdi), %xmm1 +; SSE-NEXT: movaps 1104(%rdi), %xmm0 +; SSE-NEXT: movaps 1056(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1836,8 +1850,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1488(%rdi), %xmm0 -; SSE-NEXT: movaps 1440(%rdi), %xmm1 +; 
SSE-NEXT: movaps 1296(%rdi), %xmm0 +; SSE-NEXT: movaps 1248(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1850,6 +1864,13 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1488(%rdi), %xmm0 +; SSE-NEXT: movaps 1440(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2067,14 +2088,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -2083,13 +2096,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2099,38 +2112,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; 
SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -2250,85 +2271,85 @@ ; AVX1-ONLY-LABEL: load_i64_stride6_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] 
; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2367,7 +2388,7 @@ ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] @@ -2382,128 +2403,123 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = 
xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2514,6 +2530,11 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 @@ -2642,14 +2663,6 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) @@ -2658,13 +2671,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2674,13 +2687,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2690,22 +2703,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r9) @@ -2739,104 +2760,110 @@ ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 
480(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm11[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), 
%ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm6[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] @@ -2844,163 +2871,124 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm5[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 
16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm4[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm11[1],ymm3[3],ymm11[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm13[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm9[1],ymm3[3],ymm9[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm14[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm7[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: 
vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm12[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm14 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm15[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte 
Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3008,6 +2996,15 @@ ; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3015,14 +3012,39 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3032,33 +3054,33 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 @@ -3066,7 +3088,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm5 @@ -3074,16 +3096,16 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 @@ -3091,32 +3113,31 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm14[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 904(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1096(%rdi), %ymm0 @@ -3125,23 +3146,15 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) @@ -3150,13 +3163,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 
# 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3166,13 +3179,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3182,22 +3195,30 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) @@ -3219,11 +3240,10 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 
64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX2-ONLY-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -3231,350 +3251,356 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm28 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm18, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, 
%zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm2, %zmm18, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm18, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm18, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm18, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: 
vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm18 +; 
AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm6, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512F-NEXT: # zmm1 = 
mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm16 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm27 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm2, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm9, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm7, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm7, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm7, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512F-NEXT: 
vmovdqa64 %zmm18, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3583,350 +3609,356 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm28 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,0,10,0,6,0,10] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> -; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: 
vpermt2q %zmm28, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [13,0,1,7,13,0,1,7] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm18, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm31, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm19, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm6, %zmm5 +; AVX512BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm27 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm31 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; 
AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} +; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm10, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm9, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm9, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6 +; AVX512BW-NEXT: 
vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm7, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4982,182 +5014,182 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3768, %rsp # imm = 0xEB8 -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $3736, %rsp # imm = 0xE98 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1824(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1632(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2592(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2976(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; 
AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1632(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1824(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2592(%rdi), %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 2544(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2976(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5275,123 +5307,123 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1744(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2656(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2464(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1744(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5407,164 +5439,164 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2464(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2656(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 2704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 @@ -5585,7 +5617,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -5757,7 +5789,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5808,7 +5840,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -5855,22 +5887,6 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) @@ -5887,21 +5903,21 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5919,38 +5935,54 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -6026,7 +6058,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) @@ -6042,214 +6074,214 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $3768, %rsp # imm = 0xEB8 +; AVX1-ONLY-NEXT: addq $3736, %rsp # imm = 0xE98 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $3272, %rsp # imm = 0xCC8 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm0[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 
; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm2 
+; AVX2-ONLY-NEXT: vmovaps 2352(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1968(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm8 +; 
AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6257,7 +6289,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6265,7 +6297,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6273,7 +6305,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm15 ; AVX2-ONLY-NEXT: 
vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6281,7 +6313,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1832(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1640(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6289,7 +6321,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2216(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 2024(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6297,7 +6329,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2600(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 2408(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6305,7 +6337,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2984(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 2792(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6313,352 +6345,346 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm9[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm14[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1640(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 1832(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2024(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 2216(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 2600(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2792(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 2984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1888(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1744(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2272(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: 
vmovaps 1024(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2656(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2848(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2464(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1888(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 1744(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 2080(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 2272(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 1552(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 2128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm15[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vbroadcastsd 2464(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 2320(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm13[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd 2656(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm11[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd 2848(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 2896(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm3[0],xmm7[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm5[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm5[1],ymm14[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 
1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm1[1],ymm14[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm12[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm10[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm1[1],ymm8[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] @@ -6667,27 +6693,29 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps %xmm12, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6697,7 +6725,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6707,7 +6736,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6717,7 +6747,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6727,7 +6758,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6737,7 +6769,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6747,7 +6780,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1616(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6757,54 +6791,47 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; 
AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2576(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 2576(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 2768(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %xmm3 @@ -6822,7 +6849,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] @@ -6831,7 +6858,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6893,31 +6920,31 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 1864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; 
AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2440(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm0 @@ -6926,22 +6953,6 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) @@ -6958,21 
+6969,21 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6990,38 +7001,54 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -7110,182 +7137,187 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX2-ONLY-NEXT: addq $3272, %rsp # imm = 0xCC8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm8 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 ; AVX512F-NEXT: vmovdqu64 
%zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,u> ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -7295,22 +7327,22 @@ ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -7319,17 +7351,17 @@ ; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, 
%zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -7338,17 +7370,17 @@ ; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -7357,17 +7389,17 @@ ; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -7376,334 +7408,328 @@ ; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 ; AVX512F-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; 
AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm8 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm8, 
%zmm1, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; 
AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm3, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm26, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm26, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm26, %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: 
vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm25, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7722,150 +7748,151 @@ ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm19 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} 
-; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: 
vmovaps %zmm8, 384(%rdx) +; AVX512F-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 320(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 256(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 384(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 320(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 256(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm26, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 256(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm11, 
448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm6, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7873,175 +7900,180 @@ ; AVX512BW-LABEL: load_i64_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,0,1,7,13,0,1,7] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, 
%zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vpermi2q %zmm1, %zmm4, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,u> ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -8051,22 +8083,22 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,11,5,11,5,11,5,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 
%zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -8075,17 +8107,17 @@ ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -8094,17 +8126,17 @@ ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ -8113,17 +8145,17 @@ ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm3 @@ 
-8132,334 +8164,328 @@ ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm9 
+; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm29 -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm12, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q 
%zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm3 = [0,10,0,6,0,10,0,6] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm26, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm25, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm25, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8478,150 +8504,151 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm13 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm19 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm21, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm14, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 
64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 320(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%rdx) +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm12, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 320(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 256(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 320(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 256(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 
448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 256(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 320(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 64(%r8) -; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm8, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm6, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 
%zmm12, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -8641,12 +8668,11 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX1: {{.*}} +; AVX: {{.*}} ; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -101,27 +101,26 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3] -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3] -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3] -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX512-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm5, (%rcx) -; AVX512-NEXT: vmovdqa %xmm1, (%r8) -; AVX512-NEXT: vmovdqa %xmm6, (%r9) -; AVX512-NEXT: vmovdqa %xmm2, (%r10) -; AVX512-NEXT: vmovdqa %xmm3, (%rax) +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] +; AVX512-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 88(%rdi){1to2}, %xmm1, %xmm1 +; AVX512-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 104(%rdi){1to2}, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa %xmm3, (%rsi) +; AVX512-NEXT: vmovaps %xmm4, (%rdx) +; AVX512-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512-NEXT: vmovaps %xmm5, (%r8) +; AVX512-NEXT: vmovdqa %xmm1, (%r9) +; AVX512-NEXT: vmovaps %xmm6, (%r10) +; AVX512-NEXT: vmovdqa %xmm2, (%rax) ; AVX512-NEXT: retq %wide.vec = load <14 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x 
i32> @@ -211,17 +210,16 @@ ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] @@ -261,17 +259,18 @@ ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm8, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] @@ -315,23 +314,23 @@ ; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; 
AVX512-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX512-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 -; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX512-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 @@ -342,8 +341,8 @@ ; AVX512-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-SLOW-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512-SLOW-NEXT: vmovdqa %ymm6, (%r8) -; AVX512-SLOW-NEXT: vmovdqa %ymm7, (%r9) -; AVX512-SLOW-NEXT: vmovdqa %ymm8, (%r10) +; AVX512-SLOW-NEXT: vmovdqa %ymm8, (%r9) +; AVX512-SLOW-NEXT: vmovdqa %ymm7, (%r10) ; AVX512-SLOW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512-SLOW-NEXT: vzeroupper ; AVX512-SLOW-NEXT: retq @@ -365,27 +364,28 @@ ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] ; AVX512-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %xmm6 ; AVX512-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512-FAST-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512-FAST-NEXT: vinserti128 $1, 
%xmm9, %ymm0, %ymm9 -; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] -; AVX512-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512-FAST-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512-FAST-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512-FAST-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX512-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX512-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512-FAST-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 @@ -395,9 +395,9 @@ ; AVX512-FAST-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-FAST-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512-FAST-NEXT: vmovdqa %ymm6, (%r8) ; AVX512-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512-FAST-NEXT: vmovdqa %ymm6, (%r10) +; AVX512-FAST-NEXT: vmovdqa %ymm7, (%r10) ; AVX512-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX512-FAST-NEXT: vzeroupper ; AVX512-FAST-NEXT: retq @@ -423,55 +423,57 @@ ; SSE-LABEL: load_i64_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $88, %rsp -; SSE-NEXT: movapd 320(%rdi), %xmm1 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 256(%rdi), %xmm3 +; SSE-NEXT: movapd 208(%rdi), %xmm1 +; SSE-NEXT: movapd 96(%rdi), %xmm0 ; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 304(%rdi), %xmm5 ; SSE-NEXT: movapd 192(%rdi), %xmm4 -; SSE-NEXT: movapd 240(%rdi), %xmm7 +; SSE-NEXT: movapd 80(%rdi), %xmm3 ; SSE-NEXT: movapd 128(%rdi), %xmm6 -; SSE-NEXT: movapd 288(%rdi), %xmm9 -; SSE-NEXT: movapd 176(%rdi), %xmm8 -; SSE-NEXT: movapd 336(%rdi), %xmm10 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 272(%rdi), %xmm14 +; SSE-NEXT: movapd 176(%rdi), %xmm9 +; SSE-NEXT: movapd 64(%rdi), %xmm8 +; SSE-NEXT: movapd (%rdi), %xmm10 +; SSE-NEXT: movapd 16(%rdi), %xmm7 +; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: movapd 48(%rdi), %xmm15 +; SSE-NEXT: movaps 336(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 112(%rdi), %xmm13 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm8[0] 
-; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; SSE-NEXT: movapd 160(%rdi), %xmm14 +; SSE-NEXT: movapd %xmm15, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm3[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm14, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm9[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 384(%rdi), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: movapd 400(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm7[0] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 352(%rdi), %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1] ; SSE-NEXT: movapd 416(%rdi), %xmm10 @@ -481,252 +483,247 @@ ; SSE-NEXT: movapd 432(%rdi), %xmm14 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm14[0] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] -; SSE-NEXT: movapd (%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm9 +; SSE-NEXT: movapd 224(%rdi), %xmm2 +; SSE-NEXT: movapd 
272(%rdi), %xmm9 ; SSE-NEXT: movapd %xmm9, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 64(%rdi), %xmm0 +; SSE-NEXT: movapd 288(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] -; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd 240(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 80(%rdi), %xmm4 +; SSE-NEXT: movapd 304(%rdi), %xmm4 ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0] -; SSE-NEXT: movapd 32(%rdi), %xmm5 +; SSE-NEXT: movapd 256(%rdi), %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] -; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 320(%rdi), %xmm6 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] -; SSE-NEXT: movapd %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm3, 32(%rsi) ; SSE-NEXT: movapd %xmm11, 48(%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rsi) +; SSE-NEXT: movapd %xmm12, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm2, (%rdx) +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movapd %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movapd %xmm13, 16(%rdx) -; SSE-NEXT: movapd %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movapd %xmm0, 32(%rcx) ; SSE-NEXT: movapd %xmm7, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm1, (%r8) -; SSE-NEXT: movapd %xmm8, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movapd %xmm1, 32(%r8) +; SSE-NEXT: movapd %xmm8, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm4, (%r9) -; SSE-NEXT: movapd %xmm10, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm4, 32(%r9) +; SSE-NEXT: movapd %xmm10, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, (%rax) +; SSE-NEXT: movapd %xmm5, 32(%rax) ; SSE-NEXT: movapd %xmm15, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm6, (%rax) +; SSE-NEXT: movapd %xmm6, 32(%rax) ; SSE-NEXT: movapd %xmm14, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; 
SSE-NEXT: addq $88, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm2[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm13[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} 
xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1],ymm14[0],ymm13[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = mem[0],xmm14[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm10, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm12, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) ; 
AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm13 -; AVX2-ONLY-NEXT: 
vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm13, %ymm14 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm14, %ymm15 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), 
%ymm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm14[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -736,16 +733,16 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; 
AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -998,8 +995,8 @@ ; SSE-NEXT: movapd 192(%rdi), %xmm6 ; SSE-NEXT: movapd 80(%rdi), %xmm5 ; SSE-NEXT: movapd 128(%rdi), %xmm8 -; SSE-NEXT: movapd 64(%rdi), %xmm10 -; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd 176(%rdi), %xmm10 +; SSE-NEXT: movapd 64(%rdi), %xmm11 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm9 ; SSE-NEXT: movapd 32(%rdi), %xmm7 @@ -1010,10 +1007,10 @@ ; SSE-NEXT: movapd %xmm0, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm11[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] @@ -1025,10 +1022,10 @@ ; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm10[0] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm8[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0] ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] @@ -1124,12 +1121,12 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 672(%rdi), %xmm6 ; SSE-NEXT: movapd 720(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: movapd 736(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] +; SSE-NEXT: movapd %xmm0, %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] +; SSE-NEXT: movapd 736(%rdi), %xmm7 +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm7[0] ; SSE-NEXT: movapd 688(%rdi), %xmm10 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm10[0],xmm7[1] ; SSE-NEXT: movapd 752(%rdi), %xmm12 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm12[0] ; SSE-NEXT: movapd 704(%rdi), %xmm15 @@ -1153,46 +1150,46 @@ ; SSE-NEXT: movapd 880(%rdi), %xmm11 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] -; SSE-NEXT: movapd %xmm7, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) ; SSE-NEXT: movapd %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movapd %xmm8, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) 
+; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm6, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movapd %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movapd %xmm6, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movapd %xmm13, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm8, 96(%rcx) -; SSE-NEXT: movapd %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm1, 112(%rcx) +; SSE-NEXT: movapd %xmm7, 96(%rcx) ; SSE-NEXT: movapd %xmm14, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm3, 112(%r8) ; SSE-NEXT: movapd %xmm10, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1257,466 +1254,452 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = 
xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm15[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = 
xmm8[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[0],ymm6[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[0],ymm7[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[0],ymm10[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm2[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm11[0],ymm5[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm11[0],ymm4[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, (%rax) +; AVX1-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 ; 
AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 -; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm4, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm2, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; 
AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm13 +; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm3[1],ymm12[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2112,15 +2095,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i64_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1448, %rsp # imm = 0x5A8 +; SSE-NEXT: subq $1432, %rsp # imm = 0x598 ; SSE-NEXT: movapd 208(%rdi), %xmm3 ; SSE-NEXT: movapd 96(%rdi), %xmm2 ; SSE-NEXT: movapd 144(%rdi), %xmm4 ; SSE-NEXT: movapd 192(%rdi), %xmm6 ; SSE-NEXT: movapd 80(%rdi), %xmm5 ; SSE-NEXT: movapd 128(%rdi), %xmm8 -; SSE-NEXT: movapd 64(%rdi), %xmm10 ; SSE-NEXT: movapd 176(%rdi), %xmm11 +; SSE-NEXT: movapd 64(%rdi), %xmm10 ; SSE-NEXT: movapd (%rdi), %xmm12 ; SSE-NEXT: movapd 16(%rdi), %xmm9 ; SSE-NEXT: movapd 32(%rdi), %xmm7 @@ -2400,8 +2383,8 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1456(%rdi), %xmm9 ; SSE-NEXT: movapd 1504(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1] +; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] ; SSE-NEXT: movapd 1520(%rdi), %xmm13 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm13[0] ; SSE-NEXT: movapd 1472(%rdi), %xmm2 @@ -2411,7 +2394,7 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1488(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 1552(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2419,96 +2402,95 @@ ; SSE-NEXT: movapd %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1568(%rdi), %xmm6 ; SSE-NEXT: movapd 1616(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] ; SSE-NEXT: movapd 1632(%rdi), %xmm8 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0] -; SSE-NEXT: movapd 1584(%rdi), %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm11[0],xmm8[1] +; SSE-NEXT: movapd 1584(%rdi), %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm12[0],xmm8[1] ; SSE-NEXT: movapd 1648(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movapd 1600(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1664(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1680(%rdi), %xmm3 -; SSE-NEXT: movapd 1728(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd 1744(%rdi), %xmm5 -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] +; SSE-NEXT: movapd 1680(%rdi), %xmm0 +; SSE-NEXT: movapd 1728(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 1744(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm4[0] ; SSE-NEXT: movapd 1696(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm7[0],xmm4[1] ; SSE-NEXT: movapd 1760(%rdi), %xmm10 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] ; SSE-NEXT: movapd 1712(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] -; SSE-NEXT: movapd 1776(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm4, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd 1776(%rdi), %xmm2 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] ; SSE-NEXT: movapd %xmm1, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movapd %xmm5, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps %xmm1, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; 
SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movapd %xmm12, 208(%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movapd %xmm11, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movapd %xmm0, 240(%rdx) ; SSE-NEXT: movapd %xmm6, 224(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rdx) -; SSE-NEXT: movapd %xmm14, 192(%rdx) ; SSE-NEXT: movapd %xmm9, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movapd %xmm14, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm5, 240(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm4, 240(%rcx) ; SSE-NEXT: movapd %xmm8, 224(%rcx) ; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2538,7 +2520,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm7, 240(%r8) -; SSE-NEXT: movapd %xmm11, 224(%r8) +; SSE-NEXT: movapd %xmm12, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2570,7 +2552,7 @@ ; SSE-NEXT: movapd %xmm10, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -2600,7 +2582,7 @@ ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, 240(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) @@ -2631,8 +2613,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movapd %xmm2, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2663,3246 +2644,1839 @@ ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1448, %rsp # imm = 0x5A8 +; SSE-NEXT: addq $1432, %rsp # imm = 0x598 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 -; 
AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm7 -; 
AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[0],ymm14[3],ymm11[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[0],ymm9[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 
1504(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = mem[0],xmm8[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm12[0],ymm3[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd 
{{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm3[0],ymm6[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm2[0],ymm7[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = 
xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm15[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm8 -; AVX1-ONLY-NEXT: 
vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm9[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm8[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm5[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm7[1],ymm2[0],ymm7[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm10[1],ymm3[0],ymm10[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[1],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[1],ymm1[0],ymm8[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm13[0],ymm0[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 
1648(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm14[0],ymm5[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte 
Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm13[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm14[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm5[0],ymm11[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm4[0],ymm9[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = 
mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm10[0],ymm12[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm1[0],ymm8[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = 
ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm12, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 64(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm14, (%rax) -; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rax) +; AVX1-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm6 +; AVX2-ONLY-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} 
xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm0 -; 
AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm1[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm4 -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm2, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm1, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa 912(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm8, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm7, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm6, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm9[1],ymm5[1],ymm9[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm5, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = 
mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 -; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, 
%ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] 
+; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 
32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 +; AVX2-ONLY-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: movb $24, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; 
AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, 
%zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; 
AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: movb $24, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: movb $-32, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 
128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, 
%zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 
# 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: movb $24, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q 
%zmm4, %zmm8, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: movb $-32, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 
%zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded 
Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2216, %rsp # imm = 
0x8A8 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FAST-NEXT: # zmm30 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm14 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm20, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm22 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: movb $24, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm11[4,5,4,5],zmm10[4,5,4,5] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [14,0,0,7,14,0,0,7] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm26 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [4,11,4,11] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm17 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm12[4,5,4,5],zmm24[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm21, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm3[4,5,4,5],zmm15[4,5,4,5] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm20, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm2[4,5,4,5],zmm9[4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm10, %zmm20 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: movb $-32, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm24, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm25, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm16, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm2 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = 
ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm16, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm15, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $2216, %rsp # imm = 0x8A8 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-LABEL: load_i64_stride7_vf32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: subq $2248, %rsp # imm = 0x8C8 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm30, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm27, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm27, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm27, %zmm20 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [9,0,7,0,9,0,7,0] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm26, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm26, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa64 128(%rdi), 
%zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm31 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm30 +; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm18, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm27 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm26 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,7,14,u> +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm19 +; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm13[4,5,4,5],zmm21[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm18, %zmm25 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] +; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm11, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[4,5,4,5],zmm14[4,5,4,5] +; AVX512F-NEXT: vpermi2q %zmm14, %zmm8, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm14, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm30 +; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm23, %zmm29 +; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm23 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 
+; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-NEXT: vmovdqa64 %ymm8, %ymm31 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm11, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm27 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti32x4 $1, %ymm9, %xmm20 +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm9 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti32x4 $1, %ymm9, %xmm16 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 +; AVX512F-NEXT: movb $-32, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 
64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512F-NEXT: vinserti32x4 $0, %xmm20, %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512F-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm10 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm16 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm8 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm18, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k2} +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm14, %zmm18 +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm14 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm26, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm8, %zmm14 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm20, %zmm12 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm20, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm8, 64(%rax) +; AVX512F-NEXT: vmovaps %zmm14, (%rax) +; AVX512F-NEXT: addq $2248, %rsp # imm = 0x8C8 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm8 ; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm10, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [9,0,7,0,9,0,7,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm30, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm18, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, 
%zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm4 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,7,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm13[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] ; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
896(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm21, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm25[4,5,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm25, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm22, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm9, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, 
%zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm21[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm5, %zmm24, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, 
%ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm20, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm19, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm31, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm11, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -5910,444 +4484,441 @@ ; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm4 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [9,0,7,0,9,0,7,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm30, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm30, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm20 +; 
AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm18, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,7,14,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; 
AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm13[4,5,4,5],zmm28[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] ; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm17, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm21, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, 
%zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm25[4,5,4,5] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm25, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm22, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm9, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte 
Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm21[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm5, %zmm24, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, 
%zmm30, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm21, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm20, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm17, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm19, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: 
vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm31, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm11, (%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper 
; AVX512BW-ONLY-FAST-NEXT: retq @@ -6355,444 +4926,441 @@ ; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm28 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] ; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm1, %zmm10, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm10 +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm10, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [9,0,7,0,9,0,7,0] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm27, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm30, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm31, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm18, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm24 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), 
%zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,7,14,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm18 ; AVX512DQBW-SLOW-NEXT: movb $24, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm13[4,5,4,5],zmm28[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 ; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] ; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm17, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [4,11,4,11] +; AVX512DQBW-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm15, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm9, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm17, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm25[4,5,4,5] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm25, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm22, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, 
%zmm2, %zmm25 ; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm0 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm9, %zmm22 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm8, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = ymm21[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm26 ; AVX512DQBW-SLOW-NEXT: movb $-32, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm5, %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm6, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm20, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm19, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; 
AVX512DQBW-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm31, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm11, (%rax) ; AVX512DQBW-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -6800,444 +5368,441 @@ ; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm12 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm28 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,3,10,3,10,3,10,3] ; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm25 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm29, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [9,0,7,0,9,0,7,0] +; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm30, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm31, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm19, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm5, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm21, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 
1024(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm18, %zmm31, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm18, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-FAST-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm24 ; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,7,14,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm18 ; AVX512DQBW-FAST-NEXT: movb $24, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,7,14,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, 
%zmm21, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm14[4,5,4,5],zmm11[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm13[4,5,4,5],zmm28[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [7,0,9,0,7,0,9,0] ; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [4,11,4,11] -; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm6[4,5,4,5],zmm3[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm17, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [4,11,4,11] +; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm21, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm17 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm22[4,5,4,5] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm0[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm25[4,5,4,5] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm25, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm22, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm23, %zmm23 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm2, %zmm25 ; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm9, %zmm22 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,11] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, 
%zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,13] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm19[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm19[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm12, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm5 = ymm21[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm26 ; AVX512DQBW-FAST-NEXT: movb $-32, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm5, %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm30, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm20 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm17, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm20 -; 
AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm6, %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm20, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm19, %xmm19 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm17, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm19, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm31, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 
64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm11, (%rax) ; AVX512DQBW-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq @@ -8395,3266 +6960,2997 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4232, %rsp # imm = 0x1088 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; 
AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: subq $3784, %rsp # imm = 0xEC8 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2016(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2064(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2560(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2512(%rdi), 
%xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2336(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2784(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 2736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3232(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 3184(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 2064(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2560(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 2512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} xmm11 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm11[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3008(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm11[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 2960(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3456(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm14[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 3408(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm12[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[0],ymm4[3],ymm14[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm12[0],ymm6[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = 
ymm6[0],ymm12[0],ymm6[3],ymm12[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[3],ymm10[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[3],ymm15[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[3],ymm5[2] +; AVX1-ONLY-NEXT: 
vmovdqa 2304(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm0[0],ymm6[0],ymm0[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm13[0],ymm10[0],ymm13[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm9[0],ymm4[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm11[0],ymm3[0],ymm11[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: 
vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm8[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm14[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2144(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm6[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2368(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 2256(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2592(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm5[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2816(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 2704(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 2928(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3264(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2368(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 
32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm12[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), 
%xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3488(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm15[1],ymm9[0],ymm15[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 +; 
AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm11[0],ymm0[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm0[0],ymm8[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm1[0],ymm14[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm12[0],ymm2[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: 
vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm13[1],ymm15[0],ymm13[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[1],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm10[0],ymm2[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm7[0],ymm2[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
416(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm0 -; 
AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 2048(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1] -; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1920(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = mem[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1] +; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2432(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = mem[0],xmm15[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2656(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2880(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3040(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3104(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3264(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3328(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3552(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm8[0],ymm13[0],ymm8[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm9[0],ymm11[0],ymm9[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm10[0],ymm5[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 3264(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 3168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; 
AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[0],ymm9[3],ymm15[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[0],ymm7[3],ymm11[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 
2784(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # 
xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm12[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = 
ymm10[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rdx) +; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 448(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 
352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 320(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 288(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 416(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 288(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4232, %rsp # imm = 0x1088 +; AVX1-ONLY-NEXT: addq $3784, %rsp # imm = 0xEC8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf64: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3928, %rsp # imm = 0xF58 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: subq $3544, %rsp # imm = 0xDD8 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), 
%xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = 
mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2064(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2912(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2960(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2960(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; 
AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 
1840(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3408(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; 
AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm5 +; 
AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm15[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = 
mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm4, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm13, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm5, %ymm0 
; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovdqa 2928(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm10, %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovdqa 3376(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2816(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2368(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm9, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm11, %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm14, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 2256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm2 +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm12, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm8, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovdqa 2928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm7, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm6, %ymm0 +; 
AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa 3376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 
+; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm10[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 
3424(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm15[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = 
xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2720(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2944(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm6[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; 
AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 
= xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = 
ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm11[1],ymm3[1],ymm11[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm11[1],ymm2[1],ymm11[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm1[1],ymm11[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm11[1],ymm0[1],ymm11[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps 
%ymm11, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 480(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 448(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 416(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 384(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 352(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 320(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 288(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 256(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 448(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 416(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 384(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 288(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 256(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3928, %rsp # imm = 0xF58 +; AVX2-ONLY-NEXT: addq $3544, %rsp # imm = 0xDD8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 
%zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm17, %zmm3, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm14, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: 
vmovdqa64 3072(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm2 ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm28 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm20 +; 
AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm28 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm0 +; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm8 +; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm9 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm22 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8 ; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa64 %ymm0, 
%ymm29 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm11 +; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm8 +; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm8 -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm31 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm10 +; AVX512F-NEXT: vpermi2q %zmm9, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm16, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm15 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm15, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm7 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = 
ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm2 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm11, %zmm6, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload +; 
AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm10, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm5, %zmm4 ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm21, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm12, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm24, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload 
-; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm21, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm21, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm21, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm21, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm24, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm21, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm24, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; 
AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm21, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm24, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm12, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm16, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm16, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm16, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 
# 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm31, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm21 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm20 ; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm25 -; 
AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm22 ; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm17 ; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm16 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,7,14,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm25 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <0,7,14,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm13[4,5,4,5],zmm29[4,5,4,5] -; AVX512F-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,4,11] -; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm18[4,5,4,5] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm31 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm22 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm1[4,5,4,5],zmm17[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm30[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} 
zmm25 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm1[4,5,4,5],zmm27[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm4[4,5,4,5],zmm1[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm8, %zmm29 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm13, %zmm10, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm2[4,5,4,5],zmm4[4,5,4,5] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm16 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm18, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm18, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm18, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm5, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: 
vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: 
vmovdqa64 %zmm5, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm15 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm22, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, 
%zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm23, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm24 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} ; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm14 +; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm14 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm5, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm13 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm5, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 {%k2} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm5, %zmm5 ; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm29, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm23, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rsi) +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm26, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm25, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 
256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-NEXT: vmovaps %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovaps %zmm5, 
448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -11662,783 +9958,779 @@ ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,11] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 
640(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm26 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, 
%zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 ; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm29 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm9 -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm4 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = 
mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm13 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm2 +; 
AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm7 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm5, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm10, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm4 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermi2q %zmm18, %zmm1, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm4 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm20, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm20, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm22, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm19, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm30 -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm31, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm20, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm22, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm8 +; AVX512BW-NEXT: 
vmovdqa64 %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm21, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm21, %zmm22 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: 
vpermt2q %zmm17, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm21, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm20 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; 
AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm28 +; 
AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm13 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,7,14,u> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm31[4,5,4,5] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: 
vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm15[4,5,4,5],zmm4[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,11,4,11] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm29 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm25[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[4,5,4,5],zmm26[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm27 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm1[4,5,4,5],zmm23[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm1[4,5,4,5],zmm12[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q 
%zmm19, %zmm18, %zmm25 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm9[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm1[4,5,4,5],zmm11[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm7[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[4,5,4,5],zmm3[4,5,4,5] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm6[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm15, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm26, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm26 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm5, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm1[4,5,4,5],zmm2[4,5,4,5] +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm5[4,5,4,5],zmm4[4,5,4,5] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm7 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: 
vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm9[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} ; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12448,176 +10740,189 @@ ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: 
vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm21, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm18 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; 
AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm17 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm9, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm19 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm20, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm20 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm22 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm9, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm24 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm9, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm24 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 +; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm9, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm24 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm24 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm26, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rsi) +; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm27, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm10, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm10, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm9, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) 
+; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; 
AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -12647,7 +10952,11 @@ ; AVX2-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-SLOW: {{.*}} +; AVX512DQ-FAST: {{.*}} +; AVX512DQ-SLOW: {{.*}} ; AVX512F-FAST: {{.*}} +; AVX512F-ONLY-FAST: {{.*}} +; AVX512F-ONLY-SLOW: {{.*}} ; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -119,30 +119,30 @@ ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX512-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX512-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX512-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512-NEXT: vmovaps 112(%rdi), %xmm3 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX512-NEXT: vmovaps %xmm5, (%rsi) -; AVX512-NEXT: vmovaps %xmm0, (%rdx) -; AVX512-NEXT: vmovaps %xmm6, (%rcx) -; AVX512-NEXT: vmovaps %xmm1, (%r8) -; AVX512-NEXT: vmovaps %xmm7, (%r9) -; AVX512-NEXT: vmovaps %xmm2, (%r11) -; AVX512-NEXT: vmovaps %xmm8, (%r10) -; AVX512-NEXT: vmovaps %xmm3, (%rax) +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX512-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX512-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm4, %xmm4 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vpbroadcastq 24(%rdi), %xmm5 +; AVX512-NEXT: vpunpcklqdq 88(%rdi){1to2}, %xmm5, %xmm5 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm6 +; AVX512-NEXT: vpunpcklqdq 104(%rdi){1to2}, %xmm6, %xmm6 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512-NEXT: vpbroadcastq 56(%rdi), %xmm7 +; AVX512-NEXT: vpunpcklqdq 120(%rdi){1to2}, %xmm7, %xmm7 +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) +; AVX512-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-NEXT: vmovaps %xmm2, (%r9) +; AVX512-NEXT: vmovdqa %xmm6, (%r11) +; AVX512-NEXT: vmovaps %xmm3, (%r10) +; AVX512-NEXT: vmovdqa %xmm7, (%rax) ; AVX512-NEXT: retq %wide.vec = load <16 x i64>, ptr %in.vec, 
align 64 %strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> @@ -167,69 +167,69 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i64_stride8_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm6 ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm9 -; SSE-NEXT: movaps 224(%rdi), %xmm11 +; SSE-NEXT: movaps 112(%rdi), %xmm6 +; SSE-NEXT: movaps 224(%rdi), %xmm9 ; SSE-NEXT: movaps 160(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 208(%rdi), %xmm15 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdi), %xmm12 -; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm15 +; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps 128(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps (%rdi), %xmm8 ; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps 192(%rdi), %xmm13 -; SSE-NEXT: movaps 128(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm8, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] ; SSE-NEXT: movaps %xmm7, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: movaps %xmm5, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] +; SSE-NEXT: movaps %xmm3, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movaps %xmm4, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: movaps 176(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm13, (%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) -; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) -; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: movaps %xmm13, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm7, 16(%rdx) +; SSE-NEXT: movaps 
%xmm8, (%rdx) +; SSE-NEXT: movaps %xmm15, 16(%rcx) +; SSE-NEXT: movaps %xmm12, (%rcx) +; SSE-NEXT: movaps %xmm3, 16(%r8) ; SSE-NEXT: movaps %xmm5, (%r8) -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps %xmm11, (%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) +; SSE-NEXT: movaps %xmm11, 16(%r9) +; SSE-NEXT: movaps %xmm14, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf4: @@ -422,18 +422,18 @@ ; SSE-NEXT: movaps 336(%rdi), %xmm0 ; SSE-NEXT: movaps 464(%rdi), %xmm1 ; SSE-NEXT: movaps 400(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 208(%rdi), %xmm3 +; SSE-NEXT: movaps 208(%rdi), %xmm2 ; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps 80(%rdi), %xmm3 ; SSE-NEXT: movaps 320(%rdi), %xmm4 ; SSE-NEXT: movaps 256(%rdi), %xmm11 ; SSE-NEXT: movaps 448(%rdi), %xmm5 ; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps (%rdi), %xmm13 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm14 ; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -454,16 +454,16 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm2[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -475,31 +475,31 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdi), %xmm0 -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 480(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movaps 416(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] +; SSE-NEXT: movaps 352(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps 176(%rdi), %xmm5 @@ -521,48 +521,48 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps %xmm15, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm12, 32(%r9) -; SSE-NEXT: movaps %xmm11, 48(%r9) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm10, 32(%r9) +; SSE-NEXT: movaps %xmm11, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm13, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 48(%rax) ; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm3, 48(%rax) ; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm12, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) @@ -574,24 +574,24 @@ ; AVX1-ONLY-LABEL: load_i64_stride8_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $184, %rsp -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] @@ -603,14 +603,14 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; 
AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -622,37 +622,37 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm3[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] @@ -662,47 +662,47 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm11, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: addq $184, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -710,26 +710,26 @@ ; AVX2-ONLY-LABEL: load_i64_stride8_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $136, %rsp -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 
-; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm3, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm3, %ymm6 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] @@ -744,9 +744,9 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm7[1],ymm14[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm15[2,3] @@ -754,26 +754,26 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm10, %ymm11 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm10, %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm10, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm15, %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] @@ -784,31 +784,31 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1064,21 +1064,21 @@ ; SSE-LABEL: 
load_i64_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm2 -; SSE-NEXT: movaps 896(%rdi), %xmm9 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps 384(%rdi), %xmm10 -; SSE-NEXT: movaps 576(%rdi), %xmm4 -; SSE-NEXT: movaps 512(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps (%rdi), %xmm12 -; SSE-NEXT: movaps 704(%rdi), %xmm6 -; SSE-NEXT: movaps 640(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm14 +; SSE-NEXT: movaps 960(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps 768(%rdi), %xmm9 +; SSE-NEXT: movaps 320(%rdi), %xmm3 +; SSE-NEXT: movaps 256(%rdi), %xmm10 +; SSE-NEXT: movaps 704(%rdi), %xmm4 +; SSE-NEXT: movaps 640(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps 576(%rdi), %xmm6 +; SSE-NEXT: movaps 512(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1114,7 +1114,7 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1219,11 +1219,11 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps 800(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 800(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 992(%rdi), %xmm0 ; SSE-NEXT: movaps 928(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm1 @@ -1238,31 +1238,31 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 304(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: movaps 304(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 432(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = 
xmm10[1],xmm0[1] ; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 560(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps 560(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 752(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: movaps 688(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps 880(%rdi), %xmm0 ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm4 @@ -1274,54 +1274,54 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) @@ -1355,7 +1355,7 @@ ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm9, 112(%rax) -; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps %xmm14, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1382,11 +1382,11 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm2, 112(%rax) ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) +; SSE-NEXT: movaps %xmm5, 80(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) ; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm12, 32(%rax) +; SSE-NEXT: movaps %xmm13, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 @@ -1395,85 +1395,85 @@ ; AVX1-ONLY-LABEL: load_i64_stride8_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1495,195 +1495,195 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 
416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; 
AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1691,43 +1691,43 @@ ; AVX2-ONLY-LABEL: load_i64_stride8_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vunpckhpd 
{{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1736,26 +1736,26 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -1778,6 +1778,22 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; 
AVX2-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 @@ -1794,131 +1810,115 @@ ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = 
ymm9[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded 
Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 
96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rax) ; AVX2-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; 
AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -2390,21 +2390,21 @@ ; SSE-LABEL: load_i64_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1688, %rsp # imm = 0x698 -; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm1 -; SSE-NEXT: movaps 896(%rdi), %xmm10 -; SSE-NEXT: movaps 448(%rdi), %xmm4 -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 576(%rdi), %xmm3 -; SSE-NEXT: movaps 512(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 704(%rdi), %xmm5 -; SSE-NEXT: movaps 640(%rdi), %xmm14 -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm13 +; SSE-NEXT: movaps 960(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm2 +; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps 768(%rdi), %xmm10 +; SSE-NEXT: movaps 320(%rdi), %xmm4 +; SSE-NEXT: movaps 256(%rdi), %xmm9 +; SSE-NEXT: movaps 704(%rdi), %xmm3 +; SSE-NEXT: movaps 640(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm11 +; SSE-NEXT: movaps 576(%rdi), %xmm5 +; SSE-NEXT: movaps 512(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2440,14 +2440,7 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1216(%rdi), %xmm0 -; SSE-NEXT: movaps 1152(%rdi), %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2460,8 +2453,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1472(%rdi), %xmm0 -; SSE-NEXT: movaps 1408(%rdi), %xmm1 +; SSE-NEXT: movaps 1216(%rdi), %xmm0 +; SSE-NEXT: movaps 1152(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2474,8 +2467,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1728(%rdi), %xmm0 -; SSE-NEXT: movaps 1664(%rdi), %xmm1 +; SSE-NEXT: movaps 1472(%rdi), %xmm0 +; SSE-NEXT: movaps 1408(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2488,8 +2481,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1984(%rdi), %xmm0 -; SSE-NEXT: movaps 1920(%rdi), %xmm1 +; SSE-NEXT: 
movaps 1728(%rdi), %xmm0 +; SSE-NEXT: movaps 1664(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2502,6 +2495,13 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1984(%rdi), %xmm0 +; SSE-NEXT: movaps 1920(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2824,14 +2824,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -2840,13 +2832,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2856,38 +2848,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps 
%xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -3072,54 +3072,54 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2216, %rsp # imm = 0x8A8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: subq $2168, %rsp # imm = 0x878 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), 
%xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm7[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 
@@ -3133,26 +3133,26 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3210,87 +3210,82 @@ ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 
1296(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] ; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm12 +; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm3[1],ymm13[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; 
AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3301,80 +3296,79 @@ ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3385,8 +3379,8 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; 
AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3397,6 +3391,12 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 @@ -3409,512 +3409,506 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: 
vmovaps 1648(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm14[1],ymm10[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) 
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: addq $2168, %rsp # imm = 0x878 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride8_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $2248, %rsp # imm = 0x8C8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: 
vinsertf128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1664(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 1088(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), 
%ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm4[0],ymm13[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm4[1],ymm13[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 @@ -4109,14 +4103,6 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) @@ -4125,13 +4111,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4141,13 +4127,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4157,22 +4143,30 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) @@ -4240,204 +4234,206 @@ ; ; AVX512F-LABEL: load_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512F-NEXT: subq $2600, %rsp # imm = 0xA28 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 
1984(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm23 ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm22 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm29 -; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm8 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm24[0],ymm1[2],ymm24[2] +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512F-NEXT: 
vmovdqa 704(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] ; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm10 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %ymm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm7[0],ymm17[2],ymm7[2] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm18 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm26 +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm13 +; AVX512F-NEXT: vpermi2q %zmm28, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm11 +; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm14 +; 
AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm21 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm31 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm31[0],ymm21[0],ymm31[2],ymm21[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm24[1],ymm1[3],ymm24[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] -; 
AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm7[1],ymm17[3],ymm7[3] +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm28, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm31[1],ymm21[1],ymm31[3],ymm21[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm23[0],zmm30[2],zmm23[2],zmm30[4],zmm23[4],zmm30[6],zmm23[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4452,507 +4448,500 @@ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 
-; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm29 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm26[1],zmm1[1],zmm26[3],zmm1[3],zmm26[5],zmm1[5],zmm26[7],zmm1[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm14[0],zmm17[0],zmm14[2],zmm17[2],zmm14[4],zmm17[4],zmm14[6],zmm17[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm14[1],zmm17[1],zmm14[3],zmm17[3],zmm14[5],zmm17[5],zmm14[7],zmm17[7] +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm7, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm16, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm18, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm10, %zmm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm25 = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm30[0],zmm4[0],zmm30[2],zmm4[2],zmm30[4],zmm4[4],zmm30[6],zmm4[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm30[1],zmm4[1],zmm30[3],zmm4[3],zmm30[5],zmm4[5],zmm30[7],zmm4[7] +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm21 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm27[1],zmm0[1],zmm27[3],zmm0[3],zmm27[5],zmm0[5],zmm27[7],zmm0[7] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 -; 
AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm17[0],zmm0[0],zmm17[2],zmm0[2],zmm17[4],zmm0[4],zmm17[6],zmm0[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm17[1],zmm0[1],zmm17[3],zmm0[3],zmm17[5],zmm0[5],zmm17[7],zmm0[7] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm30 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-NEXT: vpermi2q 
%zmm2, %zmm13, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm31 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm12, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm12[1],zmm2[1],zmm12[3],zmm2[3],zmm12[5],zmm2[5],zmm12[7],zmm2[7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm1[1],zmm0[1],zmm1[3],zmm0[3],zmm1[5],zmm0[5],zmm1[7],zmm0[7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm25 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, (%rsp), %zmm25, %zmm0 # 32-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 -; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm16 -; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; 
AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm25 -; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm13, %zmm2, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,7,15] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm3, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm27 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm2, %ymm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm7, %ymm7 +; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm13 +; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm13, %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] 
+; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm22, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %xmm17 +; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm17, %ymm17 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm20, %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm24, %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512F-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm29 +; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm29, %ymm29 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm24[0],ymm29[2],ymm24[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm16, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm19, %zmm1 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm24[1],ymm29[3],ymm24[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm16 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512F-NEXT: vinsertf64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm13 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm13, %zmm10, %zmm13 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = 
mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm26, %zmm14 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 128(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 192(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 128(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%r8) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512F-NEXT: vmovaps %zmm10, 64(%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 
0xA48 +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm13, 64(%rax) +; AVX512F-NEXT: vmovaps %zmm9, (%rax) +; AVX512F-NEXT: addq $2600, %rsp # imm = 0xA28 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm18 +; AVX512BW-NEXT: subq $2600, %rsp # imm = 0xA28 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm22 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm29 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm13[0],ymm29[0],ymm13[2],ymm29[2] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm8 -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm24[0],ymm1[2],ymm24[2] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm11[2,3],ymm10[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm25[0],ymm5[0],ymm25[2],ymm5[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm15[2,3],ymm11[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm10 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %ymm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm31[0],ymm2[2],ymm31[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm7 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm17 +; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm17[0],ymm7[0],ymm17[2],ymm7[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm18 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm26 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm11 +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm21 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm31 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm31[0],ymm21[0],ymm31[2],ymm21[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm13[1],ymm29[1],ymm13[3],ymm29[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm24[1],ymm1[3],ymm24[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm25[1],ymm5[1],ymm25[3],ymm5[3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm7[1],ymm17[3],ymm7[3] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm10 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm31[1],ymm2[3],ymm31[3] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm31[1],ymm21[1],ymm31[3],ymm21[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm23[0],zmm30[2],zmm23[2],zmm30[4],zmm23[4],zmm30[6],zmm23[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm29[0],zmm6[2],zmm29[2],zmm6[4],zmm29[4],zmm6[6],zmm29[6] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm14[0],zmm30[2],zmm14[2],zmm30[4],zmm14[4],zmm30[6],zmm14[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm27[0],zmm25[0],zmm27[2],zmm25[2],zmm27[4],zmm25[4],zmm27[6],zmm25[6] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm11[0],zmm7[2],zmm11[2],zmm7[4],zmm11[4],zmm7[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm13[0],zmm28[0],zmm13[2],zmm28[2],zmm13[4],zmm28[4],zmm13[6],zmm28[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4967,304 +4956,295 @@ ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm31 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm31, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm24, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm25[0],zmm17[0],zmm25[2],zmm17[2],zmm25[4],zmm17[4],zmm25[6],zmm17[6] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 {%k1} = zmm25[1],zmm17[1],zmm25[3],zmm17[3],zmm25[5],zmm17[5],zmm25[7],zmm17[7] -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm29 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm26[1],zmm1[1],zmm26[3],zmm1[3],zmm26[5],zmm1[5],zmm26[7],zmm1[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm14[0],zmm17[0],zmm14[2],zmm17[2],zmm14[4],zmm17[4],zmm14[6],zmm17[6] +; 
AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm14[1],zmm17[1],zmm14[3],zmm17[3],zmm14[5],zmm17[5],zmm14[7],zmm17[7] +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm6, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 = zmm2[1],zmm29[1],zmm2[3],zmm29[3],zmm2[5],zmm29[5],zmm2[7],zmm29[7] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm26[0],zmm0[0],zmm26[2],zmm0[2],zmm26[4],zmm0[4],zmm26[6],zmm0[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm0[1],zmm26[3],zmm0[3],zmm26[5],zmm0[5],zmm26[7],zmm0[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm25 = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm30[0],zmm4[0],zmm30[2],zmm4[2],zmm30[4],zmm4[4],zmm30[6],zmm4[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm30[1],zmm4[1],zmm30[3],zmm4[3],zmm30[5],zmm4[5],zmm30[7],zmm4[7] +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm21 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm27[1],zmm0[1],zmm27[3],zmm0[3],zmm27[5],zmm0[5],zmm27[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm26 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm17[0],zmm0[0],zmm17[2],zmm0[2],zmm17[4],zmm0[4],zmm17[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm26 {%k1} = zmm17[1],zmm0[1],zmm17[3],zmm0[3],zmm17[5],zmm0[5],zmm17[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm22[0],zmm1[0],zmm22[2],zmm1[2],zmm22[4],zmm1[4],zmm22[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm22[1],zmm1[1],zmm22[3],zmm1[3],zmm22[5],zmm1[5],zmm22[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm31, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm30 ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm24, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm23 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm7[0],zmm1[0],zmm7[2],zmm1[2],zmm7[4],zmm1[4],zmm7[6],zmm1[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm7[1],zmm1[1],zmm7[3],zmm1[3],zmm7[5],zmm1[5],zmm7[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm13, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm31 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm12, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm12[1],zmm2[1],zmm12[3],zmm2[3],zmm12[5],zmm2[5],zmm12[7],zmm2[7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k1} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm1[1],zmm0[1],zmm1[3],zmm0[3],zmm1[5],zmm0[5],zmm1[7],zmm0[7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm7 -; 
AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm15 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, (%rsp), %zmm25, %zmm0 # 32-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm21 -; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm5, %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm8 -; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm8, %ymm8 -; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm12, %ymm12 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm16 -; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm16, %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm13, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm25 -; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm25, %ymm25 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm26[0],ymm25[0],ymm26[2],ymm25[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm31, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm2, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,7,15] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm29, %zmm27 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm1, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm2, %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm13 +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm13, %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm13[0],ymm7[0],ymm13[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm22, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %xmm17 +; AVX512BW-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm17, %ymm17 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm20[0],ymm17[0],ymm20[2],ymm17[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm14, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %xmm24 +; AVX512BW-NEXT: vinserti32x4 $1, 1728(%rdi), %ymm24, %ymm24 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm29 +; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm29, %ymm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm24[0],ymm29[2],ymm24[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm16, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm26[1],ymm25[1],ymm26[3],ymm25[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm24, %zmm5 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm1, %zmm19, %zmm1 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm13[1],ymm7[1],ymm13[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm20[1],ymm17[1],ymm20[3],ymm17[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm24[1],ymm29[3],ymm24[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm16 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm13, %zmm10 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm9, %zmm13 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm3, %zmm14 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%rcx) -; 
AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%r9) +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm13, %zmm10, %zmm13 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm26, %zmm14 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 192(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm1, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512BW-NEXT: vmovaps %zmm10, 64(%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm13, 64(%rax) +; AVX512BW-NEXT: vmovaps %zmm9, (%rax) +; AVX512BW-NEXT: addq $2600, %rsp # imm = 0xA28 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -6675,75 +6655,99 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $5016, %rsp # imm = 0x1398 -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 3904(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 3840(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = 
xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: subq $5048, %rsp # imm = 0x13B8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3328(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -6772,283 +6776,267 @@ ; 
AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 
2496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3328(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3904(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3840(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps 2240(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: 
# xmm9 = mem[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -7056,7 +7044,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -7071,7 +7059,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -7152,14 +7140,6 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7169,7 +7149,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 @@ -7213,7 +7193,7 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7364,18 +7344,6 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 @@ -7388,18 +7356,6 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 @@ -7412,18 +7368,6 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 @@ -7436,18 +7380,6 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 @@ -7460,18 +7392,6 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm0 @@ -7484,533 +7404,584 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 
3296(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm8[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = 
ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 
128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = 
xmm6[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps 
%xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rsi) +; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vmovaps %xmm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 
# 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps 
%xmm1, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 368(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 304(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 272(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 
416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 
%ymm0, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8019,143 +7990,154 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $5016, %rsp # imm = 0x1398 +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $5048, %rsp # imm = 0x13B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $5064, %rsp # imm = 0x13C8 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm5 +; 
AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1664(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2496(%rdi), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2432(%rdi), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2240(%rdi), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3008(%rdi), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2752(%rdi), %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2944(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2688(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3264(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3456(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3200(%rdi), %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 4032(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 +; 
AVX2-ONLY-NEXT: vinsertf128 $1, 3776(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3968(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3712(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 960(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1152(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 1664(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2240(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2496(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2432(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2752(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 2688(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3008(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 2944(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3264(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3200(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3456(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3776(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 3712(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 4032(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 3968(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 @@ -8168,284 +8150,272 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps 2304(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), 
%ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 
-; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; 
AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 @@ -8864,22 +8834,6 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) @@ -8896,21 +8850,21 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8928,38 +8882,54 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -9123,128 +9093,124 @@ ; ; AVX512F-LABEL: load_i64_stride8_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vmovdqa64 768(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[2],ymm3[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %ymm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm24 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm24[0],ymm23[2],ymm24[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm0[0],ymm19[2],ymm0[2] +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512F-NEXT: 
vmovdqa64 1792(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm30 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm30[0],ymm16[2],ymm30[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %ymm29 +; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm29[0],ymm15[2],ymm29[2] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %ymm25 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %ymm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm25[0],ymm18[2],ymm25[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9264,29 +9230,30 @@ ; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm31 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] +; AVX512F-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %ymm31 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm31[0],ymm3[0],ymm31[2],ymm3[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %ymm27 -; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %ymm28 +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %ymm27 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm6 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm22[0],ymm6[2],ymm22[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9295,278 +9262,280 @@ ; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX512F-NEXT: vmovdqa 3776(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa 3712(%rdi), %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] ; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = 
[3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm23[1],ymm24[1],ymm23[3],ymm24[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm19, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm19[1],mem[1],ymm19[3],mem[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm17[1],ymm12[3],ymm17[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm16[1],ymm30[1],ymm16[3],ymm30[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 
64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm29[1],ymm15[3],ymm29[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm18[1],ymm25[1],ymm18[3],ymm25[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm11 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm11 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm22[1],ymm6[3],ymm22[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm27, %zmm22, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm9[0],zmm13[2],zmm9[2],zmm13[4],zmm9[4],zmm13[6],zmm9[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, 
%zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm30 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: 
vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm18[0],zmm25[2],zmm18[2],zmm25[4],zmm18[4],zmm25[6],zmm18[6] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm31[0],zmm4[2],zmm31[2],zmm4[4],zmm31[4],zmm4[6],zmm31[6] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; 
AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3776(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm27[0],zmm1[2],zmm27[2],zmm1[4],zmm27[4],zmm1[6],zmm27[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] @@ -9575,118 +9544,116 @@ ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm18[1],zmm20[1],zmm18[3],zmm20[3],zmm18[5],zmm20[5],zmm18[7],zmm20[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = 
zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm22[1],mem[1],zmm22[3],mem[3],zmm22[5],mem[5],zmm22[7],mem[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm23[1],zmm16[3],zmm23[3],zmm16[5],zmm23[5],zmm16[7],zmm23[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm4 = zmm10[1],mem[1],zmm10[3],mem[3],zmm10[5],mem[5],zmm10[7],mem[7] ; 
AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm7[1],zmm9[1],zmm7[3],zmm9[3],zmm7[5],zmm9[5],zmm7[7],zmm9[7] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm6, %zmm8, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 = 
zmm5[1],zmm23[1],zmm5[3],zmm23[3],zmm5[5],zmm23[5],zmm5[7],zmm23[7] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9694,12 +9661,12 @@ ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm3 ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload @@ -9707,212 +9674,208 @@ ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm18 {%k1} = zmm1[0],mem[0],zmm1[2],mem[2],zmm1[4],mem[4],zmm1[6],mem[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: 
vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm18[0],zmm20[2],zmm18[2],zmm20[4],zmm18[4],zmm20[6],zmm18[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm1[0],zmm30[0],zmm1[2],zmm30[2],zmm1[4],zmm30[4],zmm1[6],zmm30[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm2 ; AVX512F-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 {%k1} # 64-byte Folded Reload +; AVX512F-NEXT: # zmm10 {%k1} = zmm1[0],mem[0],zmm1[2],mem[2],zmm1[4],mem[4],zmm1[6],mem[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm4, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm12[0],zmm10[0],zmm12[2],zmm10[2],zmm12[4],zmm10[4],zmm12[6],zmm10[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q 
%zmm23, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512F-NEXT: vpermi2q %zmm6, %zmm8, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm6[0],zmm7[2],zmm6[2],zmm7[4],zmm6[4],zmm7[6],zmm6[6] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm27[1],zmm3[1],zmm27[3],zmm3[3],zmm27[5],zmm3[5],zmm27[7],zmm3[7] ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm14, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = 
zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm15[1],zmm11[1],zmm15[3],zmm11[3],zmm15[5],zmm11[5],zmm15[7],zmm11[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm1 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm18[1],zmm20[3],zmm18[3],zmm20[5],zmm18[5],zmm20[7],zmm18[7] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm18, %zmm26, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm31, %zmm14, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm17 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm24 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm22[1],zmm30[1],zmm22[3],zmm30[3],zmm22[5],zmm30[5],zmm22[7],zmm30[7] ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm14, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: 
vpermt2q %zmm1, %zmm14, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm26, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm12[1],zmm10[1],zmm12[3],zmm10[3],zmm12[5],zmm10[5],zmm12[7],zmm10[7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm26, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm1, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm16, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm7[1],zmm6[1],zmm7[3],zmm6[3],zmm7[5],zmm6[5],zmm7[7],zmm6[7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4, %zmm9 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload @@ -9924,404 +9887,400 @@ ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, 
%ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512F-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm16 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm24, %zmm21, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, 
%zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm24 -; AVX512F-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 +; AVX512F-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512F-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm24 ; AVX512F-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX512F-NEXT: vinserti32x4 $1, 1216(%rdi), %ymm9, %ymm27 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX512F-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %xmm17 -; AVX512F-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %xmm19 -; AVX512F-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm26 -; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm29 -; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm13 +; AVX512F-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512F-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm21[0],ymm13[0],ymm21[2],ymm13[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm6 -; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm7 +; AVX512F-NEXT: vinserti128 $1, 1728(%rdi), %ymm7, %ymm7 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %xmm12 +; AVX512F-NEXT: vinserti128 
$1, 1664(%rdi), %ymm12, %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm19, %zmm9 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm21 -; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa 2112(%rdi), %xmm8 +; AVX512F-NEXT: vinserti128 $1, 2240(%rdi), %ymm8, %ymm8 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %xmm20 +; AVX512F-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm20, %ymm20 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm8[0],ymm20[2],ymm8[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm5 +; AVX512F-NEXT: vinserti128 $1, 2752(%rdi), %ymm5, %ymm5 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %xmm27 +; AVX512F-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm27, %ymm27 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm27[0],ymm5[0],ymm27[2],ymm5[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm18, %zmm30, %zmm18 +; AVX512F-NEXT: vmovdqa64 
%zmm22, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm2 +; AVX512F-NEXT: vinserti128 $1, 3264(%rdi), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %xmm30 +; AVX512F-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm30, %ymm30 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm30[0],ymm2[0],ymm30[2],ymm2[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm17, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm3 +; AVX512F-NEXT: vinserti128 $1, 3776(%rdi), %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa64 3584(%rdi), %xmm22 +; AVX512F-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm22, %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm16[1],ymm0[1],ymm16[3],ymm0[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm24[1],ymm15[1],ymm24[3],ymm15[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm21[1],ymm13[1],ymm21[3],ymm13[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm20[1],ymm8[1],ymm20[3],ymm8[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm27[1],ymm5[1],ymm27[3],ymm5[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm30[1],ymm2[1],ymm30[3],ymm2[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm22[1],ymm3[1],ymm22[3],ymm3[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%rsi) -; AVX512F-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm6, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512F-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512F-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-NEXT: vmovaps %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6600, %rsp # imm = 0x19C8 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_i64_stride8_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6600, %rsp # imm = 0x19C8 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_i64_stride8_vf64: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 
448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm14[0],ymm3[0],ymm14[2],ymm3[2] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[2],ymm3[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm26[0],ymm0[0],ymm26[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm24 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm26[0],ymm23[2],ymm26[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm24[0],ymm23[2],ymm24[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm30[0],ymm0[0],ymm30[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm20[0],ymm16[2],ymm20[2] +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm0[0],ymm19[2],ymm0[2] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm17 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm12[0],ymm17[0],ymm12[2],ymm17[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm21[0],ymm17[2],ymm21[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm30 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm30[0],ymm16[2],ymm30[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm25[0],ymm0[0],ymm25[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm24[0],ymm22[2],ymm24[2] +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm29 +; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm29[0],ymm15[2],ymm29[2] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %ymm25 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %ymm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm25[0],ymm18[2],ymm25[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10341,29 +10300,30 @@ ; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm31 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm31[0],ymm10[2],ymm31[2] +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %ymm31 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm31[0],ymm3[0],ymm31[2],ymm3[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %ymm28 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm28[0],ymm19[2],ymm28[2] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %ymm27 -; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm27[0],ymm6[2],ymm27[2] +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), 
%ymm28 +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm28[0],ymm27[2],ymm28[2] +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm6 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm22[0],ymm6[2],ymm22[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10372,278 +10332,280 @@ ; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX512BW-NEXT: vmovdqa 3776(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 3712(%rdi), %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] ; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm26[1],ymm23[3],ymm26[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm26, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm26[1],mem[1],ymm26[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm23[1],ymm24[1],ymm23[3],ymm24[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm30[1],mem[1],ymm30[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm16[1],ymm20[1],ymm16[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm19, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm19[1],mem[1],ymm19[3],mem[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm12[1],ymm17[1],ymm12[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm21[1],ymm17[3],ymm21[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm16[1],ymm30[1],ymm16[3],ymm30[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm22[1],ymm24[1],ymm22[3],ymm24[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm29[1],ymm15[3],ymm29[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm18[1],ymm25[1],ymm18[3],ymm25[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 
%zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm10[1],ymm31[1],ymm10[3],ymm31[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm11 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm11 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm19[1],ymm28[1],ymm19[3],ymm28[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm27[1],ymm6[3],ymm27[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm27[1],ymm28[1],ymm27[3],ymm28[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm22[1],ymm6[3],ymm22[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm22, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm8[1],ymm5[3],ymm8[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm27[0],zmm12[0],zmm27[2],zmm12[2],zmm27[4],zmm12[4],zmm27[6],zmm12[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm13[0],zmm9[0],zmm13[2],zmm9[2],zmm13[4],zmm9[4],zmm13[6],zmm9[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm20[0],zmm21[2],zmm20[2],zmm21[4],zmm20[4],zmm21[6],zmm20[6] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 -; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm29[0],zmm26[0],zmm29[2],zmm26[2],zmm29[4],zmm26[4],zmm29[6],zmm26[6] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm16[0],zmm8[2],zmm16[2],zmm8[4],zmm16[4],zmm8[6],zmm16[6] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm14[0],zmm12[0],zmm14[2],zmm12[2],zmm14[4],zmm12[4],zmm14[6],zmm12[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm9[0],zmm11[0],zmm9[2],zmm11[2],zmm9[4],zmm11[4],zmm9[6],zmm11[6] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm15[0],zmm24[2],zmm15[2],zmm24[4],zmm15[4],zmm24[6],zmm15[6] +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm18[0],zmm25[2],zmm18[2],zmm25[4],zmm18[4],zmm25[6],zmm18[6] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm8[0],zmm4[2],zmm8[2],zmm4[4],zmm8[4],zmm4[6],zmm8[6] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm4[0],zmm31[0],zmm4[2],zmm31[2],zmm4[4],zmm31[4],zmm4[6],zmm31[6] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm1[0],zmm27[0],zmm1[2],zmm27[2],zmm1[4],zmm27[4],zmm1[6],zmm27[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] @@ -10652,118 +10614,116 @@ ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm27[1],zmm12[1],zmm27[3],zmm12[3],zmm27[5],zmm12[5],zmm27[7],zmm12[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm31[1],zmm13[1],zmm31[3],zmm13[3],zmm31[5],zmm13[5],zmm31[7],zmm13[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm18[1],zmm20[1],zmm18[3],zmm20[3],zmm18[5],zmm20[5],zmm18[7],zmm20[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; 
AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm29[1],mem[1],zmm29[3],mem[3],zmm29[5],mem[5],zmm29[7],mem[7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm24[1],mem[1],zmm24[3],mem[3],zmm24[5],mem[5],zmm24[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm22[1],mem[1],zmm22[3],mem[3],zmm22[5],mem[5],zmm22[7],mem[7] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm23[1],zmm16[3],zmm23[3],zmm16[5],zmm23[5],zmm16[7],zmm23[7] ; AVX512BW-NEXT: 
vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm9[1],zmm11[1],zmm9[3],zmm11[3],zmm9[5],zmm11[5],zmm9[7],zmm11[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm4 = zmm10[1],mem[1],zmm10[3],mem[3],zmm10[5],mem[5],zmm10[7],mem[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm8[1],zmm4[3],zmm8[3],zmm4[5],zmm8[5],zmm4[7],zmm8[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm7[1],zmm9[1],zmm7[3],zmm9[3],zmm7[5],zmm9[5],zmm7[7],zmm9[7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm25[1],zmm18[1],zmm25[3],zmm18[3],zmm25[5],zmm18[5],zmm25[7],zmm18[7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm5[1],zmm23[1],zmm5[3],zmm23[3],zmm5[5],zmm23[5],zmm5[7],zmm23[7] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10771,12 +10731,12 @@ ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm3 ; 
AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 {%k1} # 64-byte Folded Reload @@ -10784,212 +10744,208 @@ ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm31 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm31 {%k1} = zmm24[0],mem[0],zmm24[2],mem[2],zmm24[4],mem[4],zmm24[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm18 {%k1} = zmm1[0],mem[0],zmm1[2],mem[2],zmm1[4],mem[4],zmm1[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm3 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm14[0],zmm28[0],zmm14[2],zmm28[2],zmm14[4],zmm28[4],zmm14[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm18[0],zmm20[2],zmm18[2],zmm20[4],zmm18[4],zmm20[6],zmm18[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm3 ; AVX512BW-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm31[0],zmm16[0],zmm31[2],zmm16[2],zmm31[4],zmm16[4],zmm31[6],zmm16[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm1[0],zmm30[0],zmm1[2],zmm30[2],zmm1[4],zmm30[4],zmm1[6],zmm30[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm12[0],mem[0],zmm12[2],mem[2],zmm12[4],mem[4],zmm12[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, 
%zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 {%k1} # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm10 {%k1} = zmm1[0],mem[0],zmm1[2],mem[2],zmm1[4],mem[4],zmm1[6],mem[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm21[0],zmm26[2],zmm21[2],zmm26[4],zmm21[4],zmm26[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm4, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm12[0],zmm10[0],zmm12[2],zmm10[2],zmm12[4],zmm10[4],zmm12[6],zmm10[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm9[0],zmm5[0],zmm9[2],zmm5[2],zmm9[4],zmm5[4],zmm9[6],zmm5[6] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm8, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm7[0],zmm6[0],zmm7[2],zmm6[2],zmm7[4],zmm6[4],zmm7[6],zmm6[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm3[1],zmm24[3],zmm3[3],zmm24[5],zmm3[5],zmm24[7],zmm3[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm27[1],zmm3[1],zmm27[3],zmm3[3],zmm27[5],zmm3[5],zmm27[7],zmm3[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm14, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm15[1],zmm28[1],zmm15[3],zmm28[3],zmm15[5],zmm28[5],zmm15[7],zmm28[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm15 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm15[1],zmm11[1],zmm15[3],zmm11[3],zmm15[5],zmm11[5],zmm15[7],zmm11[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm19 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm19 {%k1} = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm4, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm18[1],zmm20[3],zmm18[3],zmm20[5],zmm18[5],zmm20[7],zmm18[7] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm14, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm26, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm17 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm17 {%k1} = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm20 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm31[1],zmm16[1],zmm31[3],zmm16[3],zmm31[5],zmm16[5],zmm31[7],zmm16[7] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; 
AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm22[1],zmm30[1],zmm22[3],zmm30[3],zmm22[5],zmm30[5],zmm22[7],zmm30[7] ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm8[1],zmm26[3],zmm8[3],zmm26[5],zmm8[5],zmm26[7],zmm8[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm26, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; 
AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm18[1],zmm0[1],zmm18[3],zmm0[3],zmm18[5],zmm0[5],zmm18[7],zmm0[7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm4, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm9[1],zmm5[1],zmm9[3],zmm5[3],zmm9[5],zmm5[5],zmm9[7],zmm5[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm30 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm30 {%k1} = zmm6[1],zmm3[1],zmm6[3],zmm3[3],zmm6[5],zmm3[5],zmm6[7],zmm3[7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm12[1],zmm10[1],zmm12[3],zmm10[3],zmm12[5],zmm10[5],zmm12[7],zmm10[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 {%k1} = zmm7[1],zmm6[1],zmm7[3],zmm6[3],zmm7[5],zmm6[5],zmm7[7],zmm6[7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm8 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload @@ -11001,277 +10957,277 @@ ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte 
Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdi), %ymm15, %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512BW-NEXT: vinserti32x4 $1, 128(%rdi), %ymm16, %ymm16 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm16[0],ymm0[0],ymm16[2],ymm0[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm24 -; AVX512BW-NEXT: vinserti32x4 $1, 704(%rdi), %ymm24, %ymm30 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %xmm15 +; AVX512BW-NEXT: vinserti128 $1, 704(%rdi), %ymm15, %ymm15 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm24 ; AVX512BW-NEXT: vinserti32x4 $1, 640(%rdi), %ymm24, %ymm24 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm24[0],ymm30[0],ymm24[2],ymm30[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti32x4 $1, 
1216(%rdi), %ymm9, %ymm27 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm9 -; AVX512BW-NEXT: vinserti128 $1, 1152(%rdi), %ymm9, %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm27[0],ymm9[2],ymm27[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm4, %ymm4 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %xmm17 -; AVX512BW-NEXT: vinserti32x4 $1, 1664(%rdi), %ymm17, %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm19 = ymm17[0],ymm4[0],ymm17[2],ymm4[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm19, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %xmm19 -; AVX512BW-NEXT: vinserti32x4 $1, 2240(%rdi), %ymm19, %ymm19 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm26 -; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm26, %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm26[0],ymm19[0],ymm26[2],ymm19[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm13 +; AVX512BW-NEXT: vinserti128 $1, 1216(%rdi), %ymm13, %ymm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %xmm21 +; AVX512BW-NEXT: vinserti32x4 $1, 1152(%rdi), %ymm21, %ymm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm21[0],ymm13[0],ymm21[2],ymm13[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm28, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm12, %ymm12 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm29 -; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm29, %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm29[0],ymm12[0],ymm29[2],ymm12[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm21, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm7 +; AVX512BW-NEXT: vinserti128 $1, 1728(%rdi), %ymm7, %ymm7 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm12 +; AVX512BW-NEXT: vinserti128 $1, 1664(%rdi), %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm6 -; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm6, %ymm6 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm20 -; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm20, %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm21 = ymm20[0],ymm6[0],ymm20[2],ymm6[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %xmm21 -; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm21, %ymm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm21[0],ymm7[0],ymm21[2],ymm7[2] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm28, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm20[1],ymm6[1],ymm20[3],ymm6[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm22, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm24[1],ymm30[1],ymm24[3],ymm30[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm17[1],ymm4[1],ymm17[3],ymm4[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm27[1],ymm9[3],ymm27[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm29[1],ymm12[1],ymm29[3],ymm12[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm26[1],ymm19[1],ymm26[3],ymm19[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm25, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm8 +; AVX512BW-NEXT: vinserti128 $1, 2240(%rdi), %ymm8, %ymm8 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %xmm20 +; AVX512BW-NEXT: vinserti32x4 $1, 2176(%rdi), %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm20[0],ymm8[0],ymm20[2],ymm8[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm5 +; AVX512BW-NEXT: vinserti128 $1, 2752(%rdi), %ymm5, %ymm5 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %xmm27 +; AVX512BW-NEXT: vinserti32x4 $1, 2688(%rdi), %ymm27, %ymm27 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm18 = ymm27[0],ymm5[0],ymm27[2],ymm5[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm18, %zmm30, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, 3264(%rdi), %ymm2, %ymm2 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %xmm30 +; AVX512BW-NEXT: vinserti32x4 $1, 3200(%rdi), %ymm30, %ymm30 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm22 = ymm30[0],ymm2[0],ymm30[2],ymm2[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm17, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm3 +; AVX512BW-NEXT: vinserti128 $1, 3776(%rdi), %ymm3, %ymm3 +; AVX512BW-NEXT: 
vmovdqa64 3584(%rdi), %xmm22 +; AVX512BW-NEXT: vinserti32x4 $1, 3712(%rdi), %ymm22, %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm16[1],ymm0[1],ymm16[3],ymm0[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm24[1],ymm15[1],ymm24[3],ymm15[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm21[1],ymm13[1],ymm21[3],ymm13[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm20[1],ymm8[1],ymm20[3],ymm8[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm7[1],ymm21[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm23, %zmm7 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm27[1],ymm5[1],ymm27[3],ymm5[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm30[1],ymm2[1],ymm30[3],ymm2[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm22[1],ymm3[1],ymm22[3],ymm3[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, (%rsi) -; AVX512BW-NEXT: 
vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512BW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6600, %rsp # imm = 0x19C8 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <512 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -247,25 +247,25 @@ ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: packuswb %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 ; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, (%rsi) -; SSE-NEXT: movdqa %xmm6, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm4, 16(%rsi) +; SSE-NEXT: movdqa %xmm6, (%rsi) ; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf32: @@ -365,28 +365,28 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i8_stride2_vf64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: packuswb %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: packuswb %xmm10, %xmm8 ; SSE-NEXT: movdqa %xmm7, %xmm12 ; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pand %xmm6, %xmm10 ; SSE-NEXT: packuswb %xmm12, %xmm10 ; SSE-NEXT: movdqa %xmm4, %xmm12 @@ -394,72 +394,72 @@ ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm12, %xmm6 ; SSE-NEXT: psrlw $8, %xmm11 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: packuswb 
%xmm11, %xmm3 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: packuswb %xmm11, %xmm1 ; SSE-NEXT: psrlw $8, %xmm9 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm9, %xmm2 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: packuswb %xmm9, %xmm3 ; SSE-NEXT: psrlw $8, %xmm7 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: packuswb %xmm7, %xmm1 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: packuswb %xmm7, %xmm2 ; SSE-NEXT: psrlw $8, %xmm4 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, 32(%rsi) -; SSE-NEXT: movdqa %xmm10, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm6, 48(%rsi) +; SSE-NEXT: movdqa %xmm10, 32(%rsi) +; SSE-NEXT: movdqa %xmm8, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) ; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpackuswb %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm6, %xmm7 -; AVX1-ONLY-NEXT: vpackuswb %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vpackuswb %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm10, %xmm11 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm9, %xmm12 -; AVX1-ONLY-NEXT: vpackuswb %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpackuswb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; 
AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -470,29 +470,29 @@ ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vpshufb %ymm6, 
%ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -768,13 +768,13 @@ ; SSE-LABEL: load_i8_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 @@ -850,7 +850,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; SSE-NEXT: movdqa %xmm12, %xmm1 @@ -869,10 +869,10 @@ ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill @@ -897,7 +897,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: packuswb %xmm0, %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -916,11 +916,11 @@ ; SSE-NEXT: pand %xmm7, %xmm9 ; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 144(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm0, %xmm9 @@ -939,7 +939,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: packuswb %xmm0, %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1227,74 +1227,76 @@ ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movdqa %xmm7, 32(%rdx) -; SSE-NEXT: movdqa %xmm13, 48(%rdx) -; SSE-NEXT: movdqa %xmm10, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movdqa %xmm4, 32(%rcx) -; SSE-NEXT: movdqa %xmm0, 48(%rcx) -; SSE-NEXT: movdqa %xmm9, (%rcx) -; SSE-NEXT: movdqa %xmm5, 16(%rcx) +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movdqa %xmm7, 48(%rdx) +; SSE-NEXT: movdqa %xmm13, 32(%rdx) +; SSE-NEXT: movdqa %xmm10, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movdqa %xmm4, 48(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, 16(%rcx) +; SSE-NEXT: movdqa %xmm5, (%rcx) ; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm13 ; 
AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 @@ -1307,7 +1309,7 @@ ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] @@ -1317,25 +1319,26 @@ ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm15 
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 @@ -1346,18 +1349,19 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rcx) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i8_stride3_vf64: @@ -1370,53 +1374,53 @@ ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm7 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} 
ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255] ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5] -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa 
%ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -101,9 +101,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] @@ -1028,519 +1028,518 @@ ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm15 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 -; SSE-NEXT: movdqa 96(%rdi), %xmm2 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm10 +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm9, 
%xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: packuswb %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm14[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; 
SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; 
SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm15[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; 
SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: packuswb %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm15[0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = 
xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm15[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm5 ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm3[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[0,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = 
xmm6[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: packuswb %xmm8, %xmm9 ; SSE-NEXT: pshufd $231, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: packuswb %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm9[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm8[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: packuswb %xmm8, %xmm10 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: packuswb %xmm9, %xmm10 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm11, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; SSE-NEXT: packuswb %xmm9, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = 
xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; SSE-NEXT: packuswb %xmm10, %xmm11 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[3,1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE-NEXT: packuswb %xmm10, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm11[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: packuswb %xmm11, %xmm12 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm13 
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm11[0,3] -; SSE-NEXT: movdqa %xmm5, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; SSE-NEXT: packuswb %xmm11, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm15, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm14, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps %xmm6, 48(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) +; SSE-NEXT: movaps %xmm5, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm12, (%rcx) +; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) -; SSE-NEXT: movaps %xmm8, 32(%r8) -; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps %xmm9, 32(%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -104,7 +104,7 @@ ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] @@ -1525,142 +1525,146 @@ ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u],zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,7,12,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm8, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,8,13,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm14, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,0,5,10,15],zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u],zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: 
vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u],zero,zero,zero,zero,xmm4[4,9,14,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[3,8,13],zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm3[4,9,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[0,5,10,15] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u],zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[4,9,14],zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm5, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[1,6,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, 
%xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -2012,13 +2016,13 @@ ; SSE-LABEL: load_i8_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $552, %rsp # imm = 0x228 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -2076,7 +2080,7 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -2097,11 +2101,11 @@ ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 +; SSE-NEXT: movdqa 128(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -2117,11 +2121,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2151,7 +2155,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2168,11 +2172,11 @@ ; SSE-NEXT: pand %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; 
SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm13 +; SSE-NEXT: movdqa 208(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -2188,11 +2192,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm7 @@ -2220,7 +2224,7 @@ ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2237,11 +2241,11 @@ ; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -2257,11 +2261,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3] ; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 +; SSE-NEXT: movdqa 240(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 @@ -2279,7 +2283,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa 304(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3105,450 +3109,459 @@ ; SSE-NEXT: packuswb %xmm6, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: 
movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps %xmm3, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm9, 16(%r8) -; SSE-NEXT: movdqa %xmm10, 48(%r8) -; SSE-NEXT: movdqa %xmm11, (%r8) -; SSE-NEXT: movdqa %xmm14, 32(%r8) -; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps %xmm5, 48(%r9) -; SSE-NEXT: movaps %xmm8, (%r9) -; SSE-NEXT: movaps %xmm2, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm9, 48(%r8) +; SSE-NEXT: movdqa %xmm10, 32(%r8) +; SSE-NEXT: movdqa %xmm11, 16(%r8) +; SSE-NEXT: movdqa %xmm14, (%r8) +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm5, 32(%r9) +; SSE-NEXT: movaps %xmm8, 16(%r9) +; SSE-NEXT: movaps %xmm2, (%r9) ; SSE-NEXT: addq $552, %rsp # imm = 0x228 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vmovddup 
{{.*#+}} xmm4 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = 
[128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,5,10,15,0,128,128,128,0,5,10,15,0,128,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [11,0,0,128,128,128,1,6,11,0,0,128,128,128,1,6] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,0,0,2,7,12,128,128,128,0,0,2,7,12,128,128] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovddup 
{{.*#+}} xmm10 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm4[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm11[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u],zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <4,9,14,128,128,128,128,128,128,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u],zero,zero,zero,xmm7[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[1,6,11] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u],zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[1,6,11] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u],zero,zero,zero,xmm5[3,8,13,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = mem[0,1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm12, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, 
%xmm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u],zero,zero,zero,zero,xmm5[4,9,14,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm3[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,0,1,6,11,128,128,128,128,0,1,6,11,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, 
%xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,3,8,13,0,3,8,13,0,3,8,13,0,3,8,13] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, 
%ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,9,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) ; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -3837,58 +3850,57 @@ ; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm7 ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm9 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm10 -; AVX512F-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,ymm10[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm1 = 
xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $186, %ymm12, %ymm16, %ymm1 -; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[1,6,11] -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm9, %zmm20, %zmm17 -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm19 +; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm7 +; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpternlogq $184, %zmm7, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm7, %ymm11 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13],zero,zero,zero,xmm12[1,6,11] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,4,9,14],zero,zero,zero,xmm11[2,7,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm12, %ymm14 +; AVX512F-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm14 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm14[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = 
xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm4, %ymm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm20 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm9, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm2 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm12, %ymm2 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm4, %ymm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 @@ -3897,37 +3909,37 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,7,12,17,22,27,16,21,26,31,20,25,30],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[2,7,12] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: 
vpternlogq $184, %zmm2, %zmm20, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm9, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm11, %ymm2 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm2 -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[4,9,14,u,u,u] @@ -3939,25 +3951,25 @@ ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,8,13,18,23,28,17,22,27,16,21,26,31],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[3,8,13] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm19, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm7, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm7, 
%ymm6, %ymm2 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm11, %ymm2 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -3974,24 +3986,24 @@ ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm4, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %ymm15, %ymm0, %ymm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512F-NEXT: vpternlogq $226, %ymm9, %ymm0, %ymm7 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm9 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm12, %ymm6 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm6 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -4012,7 +4024,7 @@ ; AVX512F-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -97,74 +97,75 @@ ; SSE-LABEL: load_i8_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: andnps %xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm4[0,3] +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE-NEXT: movaps {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: andps %xmm9, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: orps %xmm7, %xmm2 +; SSE-NEXT: andps %xmm9, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: andps %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: andps %xmm2, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: 
pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movd %xmm0, (%rsi) -; SSE-NEXT: movd %xmm4, (%rdx) -; SSE-NEXT: movd %xmm7, (%rcx) -; SSE-NEXT: movd %xmm6, (%r8) -; SSE-NEXT: movd %xmm1, (%r9) -; SSE-NEXT: movd %xmm2, (%rax) +; SSE-NEXT: movd %xmm3, (%rsi) +; SSE-NEXT: movss %xmm2, (%rdx) +; SSE-NEXT: movd %xmm4, (%rcx) +; SSE-NEXT: movd %xmm5, (%r8) +; SSE-NEXT: movd %xmm0, (%r9) +; SSE-NEXT: movd %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX-LABEL: load_i8_stride6_vf4: @@ -863,18 +864,18 @@ ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,11] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> @@ -883,34 +884,34 @@ ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; 
AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,6,12] ; AVX2-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,7,13] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,8,14] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX2-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,9,15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3,4],xmm1[5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsi) ; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rcx) @@ -2370,11 +2371,11 @@ ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm13 +; SSE-NEXT: movdqa 144(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2392,11 +2393,11 @@ ; SSE-NEXT: packuswb 
%xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -2411,11 +2412,11 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -2497,11 +2498,11 @@ ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 +; SSE-NEXT: movdqa 336(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2532,11 +2533,11 @@ ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 304(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa 352(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm0, %xmm3 @@ -2579,11 +2580,11 @@ ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa 368(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm3 @@ -3567,48 +3568,48 @@ ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm7, 16(%r9) +; SSE-NEXT: movdqa %xmm7, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm4, 48(%rax) ; SSE-NEXT: movdqa %xmm6, 32(%rax) -; SSE-NEXT: movdqa %xmm12, 48(%rax) +; SSE-NEXT: movdqa %xmm12, 16(%rax) ; SSE-NEXT: movdqa %xmm2, (%rax) ; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -1525,20 +1525,20 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -1625,90 +1625,92 @@ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm7[2,9] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm5[2,9] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 -; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX512F-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} 
xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm5, %xmm4, %xmm11 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm5[1],xmm6[2,3,4],xmm5[5],xmm6[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512F-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpor %xmm7, %xmm4, %xmm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512F-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512F-NEXT: vpternlogq $184, %xmm11, %xmm7, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm11, %ymm12 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero +; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512F-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm12, %ymm14 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512F-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero +; AVX512F-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX512F-NEXT: vpternlogq $184, %xmm14, %xmm7, %xmm10 ; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8 -; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512F-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512F-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX512F-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm9, %xmm9 ; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm11 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6],xmm5[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm11 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm12 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 
= xmm12[6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm5, (%rsi) -; AVX512F-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512F-NEXT: vmovdqa %xmm11, (%rcx) -; AVX512F-NEXT: vmovdqa %xmm14, (%r8) -; AVX512F-NEXT: vmovdqa %xmm12, (%r9) -; AVX512F-NEXT: vmovdqa %xmm9, (%r10) +; AVX512F-NEXT: vmovdqa %xmm3, (%rsi) +; AVX512F-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512F-NEXT: vmovdqa %xmm13, (%rcx) +; AVX512F-NEXT: vmovdqa %xmm10, (%r8) +; AVX512F-NEXT: vmovdqa %xmm9, (%r9) +; AVX512F-NEXT: vmovdqa %xmm11, (%r10) ; AVX512F-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1717,97 +1719,99 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm8[1],xmm7[2],xmm8[3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero -; AVX512BW-NEXT: vpor 
%xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm3 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: movw $4644, %r11w # imm = 0x1224 +; AVX512BW-NEXT: kmovd %r11d, %k2 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm5[1],xmm6[2,3,4],xmm5[5],xmm6[6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm9[3,10] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} +; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm4 {%k1} ; AVX512BW-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm9 {%k1} ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero +; AVX512BW-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm10 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm8 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero -; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero +; AVX512BW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm8 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm7 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6],xmm5[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} -; 
AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm7 {%k1} +; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) -; AVX512BW-NEXT: vmovdqa %xmm10, (%r9) -; AVX512BW-NEXT: vmovdqa %xmm9, (%r10) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rax) +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm9, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm10, (%r8) +; AVX512BW-NEXT: vmovdqa %xmm8, (%r9) +; AVX512BW-NEXT: vmovdqa %xmm7, (%r10) +; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <112 x i8>, ptr %in.vec, align 64 @@ -1832,16 +1836,16 @@ ; SSE-LABEL: load_i8_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $648, %rsp # imm = 0x288 -; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm14 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -1928,11 +1932,11 @@ ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 
160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -1946,12 +1950,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1972,12 +1976,12 @@ ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 192(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 @@ -1988,7 +1992,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2883,29 +2887,29 @@ ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm10, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movdqa %xmm4, (%r9) -; SSE-NEXT: movdqa %xmm14, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm14, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: movdqa %xmm6, 16(%rax) +; SSE-NEXT: movdqa %xmm5, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, (%rax) -; SSE-NEXT: movdqa %xmm8, 16(%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) ; SSE-NEXT: addq $648, %rsp # imm = 0x288 ; SSE-NEXT: retq ; @@ 
-4270,186 +4274,194 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX512BW-SLOW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k5 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: movw $992, %r11w # imm = 0x3E0 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm0 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: movw $9288, %r11w # imm = 0x2448 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm4, %ymm3, %ymm5 {%k1} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: movw $992, %r11w # imm = 0x3E0 +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-SLOW-NEXT: movw $8772, %r11w # imm = 0x2244 -; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} -; AVX512BW-SLOW-NEXT: vextracti128 $1, 
%ymm8, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k2 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] ; AVX512BW-SLOW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k4 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm0 {%k4} ; AVX512BW-SLOW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-SLOW-NEXT: kmovd %edi, %k2 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-SLOW-NEXT: kmovd %edi, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} -; AVX512BW-SLOW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-SLOW-NEXT: kmovd %edi, %k3 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb 
{{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm9 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm11, %xmm9, %xmm11 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm9 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7,8,9],ymm12[10],ymm9[11,12,13],ymm12[14],ymm9[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF +; AVX512BW-SLOW-NEXT: kmovd %edi, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k6} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm5, %ymm11 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX512BW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k4 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = 
ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5,6],ymm12[7,8],ymm11[9,10],ymm12[11],ymm11[12,13,14],ymm12[15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, 
%ymm6, %ymm11 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm12 {%k1} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3],ymm13[4],ymm12[5,6],ymm13[7,8],ymm12[9,10,11],ymm13[12],ymm12[13,14],ymm13[15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm1, %ymm13 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm12 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm4, %ymm3, %ymm13 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm1, %ymm14 {%k2} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} 
-; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm10, (%rdx) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm12, (%rcx) +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512BW-SLOW-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = 
ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm9, (%rdx) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm10, (%rcx) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm11, (%r8) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm5, (%r9) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm4, (%r10) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm12, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm13, (%r10) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -4457,185 +4469,186 @@ ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k5 -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 
= ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm2 {%k5} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpermi2w %ymm0, %ymm1, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: movw $992, %r11w # imm = 0x3E0 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm5, %ymm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-FAST-NEXT: movw $8772, %r11w # imm = 0x2244 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6] -; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k1} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6] +; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX512BW-FAST-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k4 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm7, %ymm2 {%k4} ; AVX512BW-FAST-NEXT: movw $4644, %r11w # imm = 0x1224 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k2 -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] -; 
AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm7 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,17,10,3,u,21,14,7,24,u,u,u,28,u,u,31> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm7 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: movl $511, %r11d # imm = 0x1FF ; AVX512BW-FAST-NEXT: kmovd %r11d, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm8, %ymm7 {%k3} ; AVX512BW-FAST-NEXT: movw $9288, %r11w # imm = 0x2448 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k3 -; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6] -; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm5, %ymm8 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6] +; AVX512BW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm8 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = 
zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,18,11,4,21,14,7,u,25,u,u,28,u,u,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm10 ; AVX512BW-FAST-NEXT: movl $261632, %r11d # imm = 0x3FE00 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm10 {%k5} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX512BW-FAST-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6] -; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,3,5,6,1,3,5,6] +; AVX512BW-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm9 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm10[u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 +; AVX512BW-FAST-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-FAST-NEXT: kmovd %edi, %k5 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm12, %ymm10 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm12 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,1,18,11,4,u,22,15,u,25,u,u,u,29,u,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FAST-NEXT: vpor 
%xmm13, %xmm14, %xmm13 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm4, %ymm3, %ymm13 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,1,u,19,12,5,22,15,u,u,26,u,u,29,u,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm14[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k3} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] ; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} +; AVX512BW-FAST-NEXT: vpblendmw %ymm4, %ymm3, %ymm14 {%k1} ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: 
vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,25,18,3,28,21,u,7,u,u,10,u,u,u,14,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm0, %ymm1, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm15[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm5 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%rdx) -; AVX512BW-FAST-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512BW-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, (%r9) -; AVX512BW-FAST-NEXT: vmovdqa %ymm4, (%r10) +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm6, %ymm5 {%k5} +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm4, %ymm3 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,25,18,u,4,29,22,7,u,u,u,11,u,u,14,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm0, %ymm1, %ymm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1,2,3,4,5,6,7],ymm3[8],ymm5[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BW-FAST-NEXT: vmovdqa %ymm7, (%rdx) +; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%rcx) +; AVX512BW-FAST-NEXT: vmovdqa %ymm10, (%r8) +; AVX512BW-FAST-NEXT: vmovdqa %ymm12, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa %ymm13, (%r10) ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -4661,19 +4674,19 @@ ; SSE-LABEL: load_i8_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1528, %rsp # imm = 0x5F8 -; SSE-NEXT: movdqa 208(%rdi), %xmm12 -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -4751,11 +4764,11 @@ ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 272(%rdi), %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -4771,11 +4784,11 @@ ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 240(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -4793,11 +4806,11 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm3, 
%xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -4808,7 +4821,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa 320(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4906,10 +4919,10 @@ ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa 256(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -4923,11 +4936,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 240(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 224(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 @@ -4946,11 +4959,11 @@ ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 @@ -4962,7 +4975,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6699,78 +6712,78 @@ ; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm11, 32(%rax) ; SSE-NEXT: movdqa %xmm9, 48(%rax) -; SSE-NEXT: movdqa %xmm7, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm1, (%rax) +; SSE-NEXT: movdqa %xmm1, 32(%rax) ; SSE-NEXT: movdqa %xmm13, 48(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm3, 16(%rax) +; SSE-NEXT: movdqa %xmm12, 16(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) ; SSE-NEXT: addq $1528, %rsp # imm = 0x5F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $744, %rsp # imm = 0x2E8 +; AVX1-ONLY-NEXT: subq $760, %rsp # imm = 0x2F8 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = 
[128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm7 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128] @@ -6812,12 +6825,11 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 @@ -6825,17 +6837,16 @@ ; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] @@ -6846,34 +6857,33 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 @@ -6881,8 +6891,8 @@ ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 @@ -6895,12 +6905,12 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 @@ -6923,13 +6933,13 @@ ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 @@ -6949,13 +6959,13 @@ ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 @@ -7046,19 +7056,19 @@ ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload @@ -7076,7 +7086,7 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm11 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill @@ -7087,7 +7097,7 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14 @@ -7117,11 +7127,12 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7132,78 +7143,75 @@ ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm12, %ymm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} 
xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 @@ -7213,16 +7221,18 @@ ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX1-ONLY-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] @@ -7231,157 +7241,157 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,8,15,0,1,8,15,0,1,8,15,0,1,8,15,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte 
Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm6 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm9[7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm11, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vxorps %xmm7, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 
16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5] ; AVX1-ONLY-NEXT: # xmm4 = 
mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7391,171 +7401,164 @@ ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm5[6,13] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm0[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm13[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; 
AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7563,16 +7566,17 @@ ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) @@ -7601,8 +7605,8 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: addq $760, %rsp # imm = 0x2F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -10863,353 +10867,361 @@ ; ; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k2 -; AVX512BW-ONLY-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: 
vporq %xmm4, %xmm3, %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm0, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm2, %ymm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm6, %ymm0 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm22 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm8, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 224(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm14, %ymm0 {%k6} ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm5 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm10[2],ymm5[3,4,5],ymm10[6],ymm5[7,8,9],ymm10[10],ymm5[11,12,13],ymm10[14],ymm5[15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm5, %ymm20 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512BW-ONLY-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm10 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm10, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm10, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm0, 
%ymm21 ; AVX512BW-ONLY-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm10 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm10, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm10, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm11, %ymm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7,8,9],ymm13[10],ymm10[11,12,13],ymm13[14],ymm10[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm19, %ymm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm6, %ymm13 {%k6} 
+; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm13, %xmm19, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm10 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm11, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm13 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm13, %xmm13 ; AVX512BW-ONLY-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw 
%ymm1, %ymm10, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm13 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm9, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm13, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm11, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm4 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm13, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb 
{{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm13, %xmm19, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512BW-ONLY-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm13, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm13, %xmm19, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm4, %zmm19 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm15, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm15, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2],ymm13[3],ymm4[4,5,6],ymm13[7,8],ymm4[9,10],ymm13[11],ymm4[12,13,14],ymm13[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm15, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3],ymm13[4],ymm4[5,6],ymm13[7,8],ymm4[9,10,11],ymm13[12],ymm4[13,14],ymm13[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm13, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm19 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm14, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm20 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm14, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k7 -; 
AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7,8],ymm13[9],ymm4[10,11,12],ymm13[13],ymm4[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm18, %ymm0 {%k4} ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm15, %ymm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm16, %ymm5 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm11, %ymm13 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm2, %ymm15 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm1, %ymm14 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm1, %ymm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm11 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm7, %ymm1 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm12, %ymm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm7, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu16 %zmm2, %zmm11 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm11, %ymm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm8, %ymm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4],ymm13[5,6],ymm8[7,8],ymm13[9,10,11],ymm8[12],ymm13[13,14],ymm8[15] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm25, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7,8],ymm9[9],ymm15[10,11],ymm9[12],ymm15[13,14,15] +; 
AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm7, %zmm11 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm26, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm24, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero @@ -11221,58 +11233,43 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rdi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512BW-ONLY-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm4, %xmm0, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm24, %ymm2, %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; 
AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512BW-ONLY-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k3 -; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm7, %ymm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k4 ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] @@ -11280,685 +11277,708 @@ ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm6, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm6, %zmm23 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm6 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm9, %xmm6, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm6 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm6, %ymm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm14, %ymm6, %ymm9 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm9, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm9, %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm21 ; AVX512BW-ONLY-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm9 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm9, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm9, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,17,10,3,u,21,14,7,24,u,u,u,28,u,u,31> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm2, %ymm24, %ymm9 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} 
ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] -; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm7, %ymm17 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,3,4,6,1,3,4,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm13, %ymm17, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm1, %xmm17, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: 
vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,18,11,4,21,14,7,u,25,u,u,28,u,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm2, %ymm24, %ymm1 ; AVX512BW-ONLY-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] -; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,3,5,6,1,3,5,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm13, %ymm17, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm17, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,u,22,15,u,25,u,u,u,29,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm2, %ymm24, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm1 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 208(%rdi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[5,12] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm22, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 ; AVX512BW-ONLY-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = 
xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm22, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm17, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm14, %ymm6, %ymm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = 
xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm16, %ymm15, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm14, %ymm6, %ymm1 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 -; 
AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm16, %ymm15, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm14, %ymm1 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm1, %xmm0, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm16, %ymm15, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} 
ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,u,u,u,7,24,17,10,u,28,21,14,31> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm14, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm14, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), 
%k3 # 4-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm13, %xmm21, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm22, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm11, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm11, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm0, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 
{%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm12, %zmm11 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm3, %ymm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,4,u,u,7,u,25,18,11,28,21,14,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %eax # imm = 0x3FE000 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm3, %ymm1 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm12 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm8, %ymm3 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm7 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm19[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm8, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm19[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, 
%ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm10, %ymm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm19[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm10, %ymm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,u,19,12,5,22,15,u,u,26,u,u,29,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm2, %ymm24, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm22[5,12],zero,zero,xmm22[1,8,15],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,25,18,3,28,21,u,7,u,u,10,u,u,u,14,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm24, %ymm2, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm8, %zmm12 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm12, %xmm3 -; 
AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,25,18,u,4,29,22,7,u,u,u,11,u,u,14,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm24, %ymm2, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k5} ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rdi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512DQBW-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm0 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k2 -; AVX512DQBW-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm16 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm0, %xmm17 +; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k6 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm2, %ymm0 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $992, %ax # imm = 
0x3E0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQBW-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k6 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6} +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm6, %ymm0 {%k3} ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm22 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm8, %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm24 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 224(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm11, %xmm5 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm0 {%k6} ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm0, %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm5 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm5[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3,4,5],ymm11[6],ymm5[7,8,9],ymm11[10],ymm5[11,12,13],ymm11[14],ymm5[15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm20 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 416(%rdi), %ymm16 ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX512DQBW-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k4 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm11 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm11, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm11, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21 ; AVX512DQBW-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm11 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm11, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm11, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm10, %ymm11 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQBW-SLOW-NEXT: vextracti128 
$1, %ymm11, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7,8,9],ymm15[10],ymm11[11,12,13],ymm15[14],ymm11[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm22, %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm19, %ymm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm6, %ymm15 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k5} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm10, %ymm3 {%k4} +; 
AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8,9,10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm15 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm15[2,9],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm15, %xmm15 ; AVX512DQBW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm14, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm14, %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm8, %xmm21 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm21, %xmm14 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm18 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm9, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = 
xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm15, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm10, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm7, %ymm4 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm15, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512DQBW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k2 ; AVX512DQBW-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm18, %xmm21, %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm15, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm15, %xmm19, %xmm15 +; AVX512DQBW-SLOW-NEXT: 
vinserti32x4 $2, %xmm15, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm4, %zmm19 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3 ; AVX512DQBW-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k6} +; 
AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm15[3],ymm4[4,5],ymm15[6],ymm4[7,8,9,10],ymm15[11],ymm4[12,13],ymm15[14],ymm4[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1,2],ymm15[3],ymm4[4,5,6],ymm15[7,8],ymm4[9,10],ymm15[11],ymm4[12,13,14],ymm15[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, 
%ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1,2,3],ymm15[4],ymm4[5,6],ymm15[7,8],ymm4[9,10,11],ymm15[12],ymm4[13,14],ymm15[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm15, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm19 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm2, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u] -; 
AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm20 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7,8],ymm15[9],ymm4[10,11,12],ymm15[13],ymm4[14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; 
AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k6} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm18, %ymm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm14, %ymm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQBW-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm11 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 
4-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm16, %ymm5 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm10, %ymm15 {%k6} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm2, %ymm14 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm1, %ymm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm10, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm1, %ymm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm12, %ymm10 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm7, %ymm1 {%k6} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm12, %ymm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm10, %ymm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm11, 
%xmm6 +; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm11, %zmm11 -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm8, %ymm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm8 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm25, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k5} ; AVX512DQBW-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX512DQBW-SLOW-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm9 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm11, %xmm26, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm24, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; 
AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm21, %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper @@ -11966,46 +11986,30 @@ ; ; AVX512DQBW-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; 
AVX512DQBW-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm0 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k2 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm4, %xmm0, %xmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm25, %ymm2, %ymm4 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 ; AVX512DQBW-FAST-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: kmovq %k1, %k3 -; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm7, %ymm4 {%k6} ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] @@ -12013,318 +12017,331 @@ ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] ; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm8 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm10, %xmm5 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm6, %zmm6 ; AVX512DQBW-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm6, %zmm24 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512DQBW-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm10, %xmm21 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm6 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm9, %xmm6, %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQBW-FAST-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %ymm16 +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm6 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k7 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} -; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm20 {%k7} +; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm14 ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512DQBW-FAST-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm14, %ymm6, %ymm9 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm9, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm9, %xmm9 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm21 ; AVX512DQBW-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 -; AVX512DQBW-FAST-NEXT: 
vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm9 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm9, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm9, %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,17,10,3,u,21,14,7,24,u,u,u,28,u,u,31> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm2, %ymm25, %ymm9 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm7, %ymm17 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [1,3,4,6,1,3,4,6] +; AVX512DQBW-FAST-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermd %ymm13, %ymm17, %ymm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; 
AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm1, %xmm17, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k5} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,18,11,4,21,14,7,u,25,u,u,28,u,u,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm2, %ymm25, %ymm1 ; AVX512DQBW-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm14, %xmm19, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm1 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [1,3,5,6,1,3,5,6] +; AVX512DQBW-FAST-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermd %ymm13, %ymm17, %ymm13 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQBW-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm17, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm8, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm19, %xmm15, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,u,22,15,u,25,u,u,u,29,u,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm2, %ymm25, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm1 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[5,12] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm22, %xmm17 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 ; AVX512DQBW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 ; AVX512DQBW-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm24, %xmm23 -; 
AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm1 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm22, %xmm17 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm17, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512DQBW-FAST-NEXT: kmovq %rax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u] +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] -; AVX512DQBW-FAST-NEXT: vporq %xmm21, %xmm18, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm14, %ymm6, %ymm1 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, 
%ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm16, %ymm15, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm14, %ymm6, %ymm1 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm16 -; AVX512DQBW-FAST-NEXT: 
vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm16, %ymm15, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm14, %ymm1 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm20 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} ; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm1, %xmm0, %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm16, %ymm15, %ymm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm0, %ymm20 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,u,u,u,7,24,17,10,u,28,21,14,31> +; 
AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1} -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm14, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; 
AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm13, %xmm22, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm13, %xmm11, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm2, %zmm11, %zmm11 -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm2 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm21, %xmm2 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm8, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm11 
{%k5} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm23 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm3, %ymm22 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,u,u,7,u,25,18,11,28,21,14,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm16, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm11 {%k6} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm11, %xmm11 +; AVX512DQBW-FAST-NEXT: movl $4186112, %eax # imm = 0x3FE000 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm3, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm7, %ymm10, %ymm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm8, %ymm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm7 {%k4} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm23[u,u,2,9],zero,zero,zero,xmm23[5,12],zero,zero,xmm23[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm23, %xmm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[6,13] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm19[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm10, %ymm8 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm19[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14] +; AVX512DQBW-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm10, %ymm1 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm19[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[1,8,15] +; AVX512DQBW-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm10, %ymm7 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,u,19,12,5,22,15,u,u,26,u,u,29,u,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm2, %ymm25, %ymm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm8, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm8 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm22[5,12],zero,zero,xmm22[1,8,15],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,25,18,3,28,21,u,7,u,u,10,u,u,u,14,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm25, %ymm2, %ymm12 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm1, %zmm8 {%k5} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm21, %xmm7, %xmm3 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] 
-; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm3, %zmm3 -; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQBW-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,25,18,u,4,29,22,7,u,u,u,11,u,u,14,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm25, %ymm2, %ymm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5} ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm3, %zmm4, %zmm5 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm21, %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7],ymm11[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rdi) -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%rdi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -1886,535 +1886,533 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i8_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: subq $888, %rsp # imm = 0x378 +; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm14 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 -; SSE-NEXT: movdqa 224(%rdi), %xmm8 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; 
SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 240(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: packuswb %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: packuswb %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,1,1] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE-NEXT: movdqa %xmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm12 ; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 
= xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm15, %xmm15 +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: packuswb %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte 
Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm15, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm10, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = 
xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded 
Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2423,59 +2421,58 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $212, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: 
pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2484,45 +2481,45 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2530,7 +2527,7 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2538,7 +2535,7 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2548,7 +2545,7 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2556,7 +2553,7 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2565,15 +2562,14 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] +; SSE-NEXT: packuswb %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: por %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2585,11 +2581,11 @@ ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 
16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2597,7 +2593,7 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2605,16 +2601,16 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: packuswb %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,2,2] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2622,7 +2618,7 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2631,202 +2627,202 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshuflw 
$231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshufhw $237, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw $116, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 
= mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; 
SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 
16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movapd %xmm10, (%r9) -; SSE-NEXT: movapd %xmm6, 16(%r9) +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps 
%xmm2, (%r8) +; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) +; SSE-NEXT: movapd %xmm7, 16(%rax) +; SSE-NEXT: movapd %xmm8, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, (%rax) -; SSE-NEXT: movapd %xmm15, 16(%rax) +; SSE-NEXT: movapd %xmm14, 16(%rax) +; SSE-NEXT: movapd %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 16(%rax) -; SSE-NEXT: movapd %xmm5, (%rax) -; SSE-NEXT: addq $904, %rsp # imm = 0x388 +; SSE-NEXT: movapd %xmm12, (%rax) +; SSE-NEXT: addq $888, %rsp # imm = 0x378 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf32: @@ -4468,138 +4464,136 @@ ; AVX512F-SLOW-LABEL: load_i8_stride8_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqb %ymm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; 
AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-SLOW-NEXT: vpmovqb %ymm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, 
%xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm12 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, 
%xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4607,41 +4601,41 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm27 +; 
AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4649,41 +4643,44 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm26 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm2 @@ -4691,43 +4688,44 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovdqa64 
%xmm22, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm11 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm14, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm2 @@ -4735,41 +4733,37 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm7 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm2 @@ -4779,32 +4773,32 @@ ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] @@ -4812,8 +4806,8 @@ ; AVX512F-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, (%r9) @@ -5063,291 +5057,293 @@ ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm9, %xmm9 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} 
ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; 
AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; 
AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm16, 
%zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, 
%xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm19, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] ; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm20, (%rsi) ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm21, (%rdx) ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm22, (%rcx) @@ -5595,22 +5591,22 @@ ; SSE-LABEL: load_i8_stride8_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2040, %rsp # imm = 0x7F8 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa 192(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 208(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: movdqa 224(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 @@ -5635,7 +5631,7 @@ ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm3 @@ -5655,17 +5651,17 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa 160(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 128(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 @@ -6588,10 +6584,9 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -6642,10 +6637,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 
; SSE-NEXT: pandn %xmm0, %xmm1 @@ -6918,11 +6914,10 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -6947,14 +6942,15 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm2, %xmm8 ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -7095,12 +7091,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -7131,14 +7126,16 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; 
SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 @@ -7174,8 +7171,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 @@ -7219,7 +7215,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $231, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -7264,7 +7260,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -7297,7 +7293,8 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,3,3] +; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7319,8 +7316,7 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -7418,52 +7414,52 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rcx) +; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, 48(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) +; SSE-NEXT: movapd %xmm5, 32(%rax) +; SSE-NEXT: movapd %xmm7, 48(%rax) ; SSE-NEXT: movapd %xmm11, 16(%rax) ; SSE-NEXT: movapd %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -7476,69 +7472,69 @@ ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: subq $840, %rsp # imm = 0x348 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 
%xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: 
vmovdqa 416(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] @@ -7548,63 +7544,64 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; 
AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -7615,43 +7612,43 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 @@ -7675,11 +7672,10 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: 
vpshufb %xmm2, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] @@ -7719,39 +7715,40 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7759,10 +7756,10 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 @@ -7775,10 +7772,9 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7810,16 +7806,15 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte 
Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] @@ -7829,9 +7824,10 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7842,16 +7838,14 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7859,24 +7853,25 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7908,7 +7903,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -7920,16 +7916,15 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7940,19 +7935,19 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7960,40 +7955,40 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, 
%xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -8020,14 +8015,15 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8042,12 +8038,12 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm15 @@ -8069,33 +8065,32 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 
+; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -8106,12 +8101,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -8125,8 +8120,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8137,19 +8132,19 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -8170,13 +8165,12 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8187,27 +8181,30 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] @@ -8219,33 +8216,33 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: 
vpshufb %xmm3, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -8254,43 +8251,44 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, 
%xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -8298,111 +8296,111 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: addq $840, %rsp # imm = 0x348 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i8_stride8_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb 
%xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -8410,100 +8408,100 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: 
vmovdqa 320(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm3 +; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm5 @@ -8518,9 +8516,9 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8528,9 +8526,9 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm4 @@ -8544,69 +8542,69 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, 
%xmm14 @@ -8620,9 +8618,9 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8631,85 +8629,85 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload 
+; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: 
vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -8723,9 +8721,9 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8734,83 +8732,83 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 
16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; 
AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -8825,8 +8823,8 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8835,9 +8833,9 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -8845,77 +8843,77 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; 
AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -8931,9 +8929,9 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8941,9 +8939,9 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -8956,67 +8954,67 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -9032,9 +9030,9 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -9043,9 +9041,9 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -9059,59 +9057,59 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = 
[7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] @@ -9129,14 +9127,14 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -9155,14 +9153,14 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] @@ -9171,58 +9169,58 @@ ; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) 
+; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-SLOW-NEXT: addq $840, %rsp # imm = 0x348 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -9230,177 +9228,184 @@ ; AVX2-FAST-LABEL: load_i8_stride8_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $904, %rsp # imm = 0x388 -; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: 
vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; 
AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm4 +; 
AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm8 +; 
AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 @@ -9423,93 +9428,94 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm10 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] @@ -9530,16 +9536,15 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -9562,77 +9567,79 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = 
[4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] @@ -9648,26 +9655,24 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] @@ -9686,20 +9691,20 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 @@ -9727,30 +9732,31 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 @@ -9758,21 +9764,21 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm10 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] @@ -9780,10 +9786,10 @@ ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] @@ -9804,14 +9810,15 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] ; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] @@ -9831,9 +9838,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -9866,38 +9873,38 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $904, %rsp # imm = 0x388 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -9905,73 +9912,72 @@ ; 
AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm12 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -9979,100 +9985,100 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm5 @@ -10087,9 +10093,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10097,9 +10103,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm4 @@ -10113,69 +10119,69 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 @@ -10189,9 +10195,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10200,85 +10206,85 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm2, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte 
Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -10292,9 +10298,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10303,83 +10309,83 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = 
xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 
= xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -10394,8 +10400,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10404,9 +10410,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -10414,77 +10420,77 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -10500,9 +10506,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10510,9 +10516,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -10525,67 +10531,67 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb 
%xmm0, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 @@ -10601,9 +10607,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10612,9 +10618,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -10628,59 +10634,59 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] @@ -10698,14 +10704,14 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -10724,14 +10730,14 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] @@ -10740,228 +10746,237 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $840, %rsp # imm = 0x348 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm9 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpmovqb %ymm4, %xmm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, 
%xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqb %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm27 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm17, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm18 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512F-SLOW-NEXT: vpmovqb %zmm18, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vpmovqb %ymm5, %xmm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm28 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 
-; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-SLOW-NEXT: vpmovqb %zmm30, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm15 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm17, %zmm6 -; AVX512F-SLOW-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm18, %zmm5 +; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm14 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10975,298 +10990,302 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm30, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm19 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm17, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512F-SLOW-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb 
%xmm0, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm29 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm17 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 -; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm30, %zmm1 +; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm17, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm14 +; AVX512F-SLOW-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512F-SLOW-NEXT: 
vpshufb %xmm2, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm27 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm1 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm20, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm31 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm23 +; AVX512F-SLOW-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; 
AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm18 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm20 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 +; 
AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 @@ -11274,183 +11293,179 @@ ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 
16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm17, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm14 +; AVX512F-SLOW-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm18, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 -; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm17, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm17 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm14 +; AVX512F-SLOW-NEXT: vpmovqb %zmm14, %xmm14 +; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm24 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = 
[0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 @@ -11458,79 +11473,78 @@ ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm8 +; 
AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm23, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm23, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm4 +; 
AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm26, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11548,7 +11562,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -12116,668 +12130,665 @@ ; ; AVX512BW-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm4 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; 
AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm4, %xmm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: subq $664, %rsp # imm = 0x298 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm20 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm1, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm1, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm1, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm1, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm21 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm21, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm10, %xmm10 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm28 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm28, %xmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 352(%rdi), %xmm27 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] 
-; AVX512BW-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm20 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm27, %xmm11 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm19, %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm1, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX512BW-SLOW-NEXT: vpmovqb %zmm22, %xmm14 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm24 ; AVX512BW-SLOW-NEXT: movb $-64, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm28 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm28, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm24 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm0, %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm0, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm1, %xmm16 ; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; 
AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm10, %xmm10 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3] -; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm21 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 416(%rdi), %xmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm8, %xmm18 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm16 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm7, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm2, %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm17[0],xmm1[0],xmm17[1],xmm1[1],xmm17[2],xmm1[2],xmm17[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, %xmm16 -; AVX512BW-SLOW-NEXT: 
vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm30, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm20, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm3, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm31, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm27, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm17, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm24 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm17 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm17, %xmm17 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm2, %xmm2 +; 
AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 80(%rdi), %xmm29 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm4, %xmm23 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm23[0],xmm2[0],xmm23[1],xmm2[1],xmm23[2],xmm2[2],xmm23[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm14 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm20 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm31, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm25 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm0, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm6, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm6, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb 
%xmm3, %xmm5, %xmm26 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm25 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm14, %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm31[0],xmm26[0],xmm31[1],xmm26[1],xmm31[2],xmm26[2],xmm31[3],xmm26[3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm25[0],xmm5[0],xmm25[1],xmm5[1],xmm25[2],xmm5[2],xmm25[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm22, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm25[0],xmm5[0],xmm25[1],xmm5[1],xmm25[2],xmm5[2],xmm25[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX512BW-SLOW-NEXT: vmovdqa64 144(%rdi), %xmm16 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm5, %zmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm6, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm26 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm18 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm7, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm27, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm22, %zmm13 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm31 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm20, %xmm1 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm31[0],xmm1[1],xmm31[1],xmm1[2],xmm31[2],xmm1[3],xmm31[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm22, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm21, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm18, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm5, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm12 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm26, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = 
[3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm17, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm6, %zmm13 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm21 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm22 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm25 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm23, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb 
%xmm0, %xmm5, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm23 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm28, %zmm2 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm22 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm24, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload -; AVX512BW-SLOW-NEXT: 
vpshufb %xmm19, %xmm18, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm21, %xmm31 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm20, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm31[0],xmm2[1],xmm31[1],xmm2[2],xmm31[2],xmm2[3],xmm31[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %xmm10, %xmm6 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm21 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm31 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm10, %zmm13 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; 
AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm25, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm27 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm18, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm8, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsp), %xmm24 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; 
AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm12, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm6, %xmm31 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm20, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm31[0],xmm3[1],xmm31[1],xmm3[2],xmm31[2],xmm3[3],xmm31[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm22 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; 
AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm27, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm7, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm25 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm15, %xmm14 +; 
AVX512BW-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm24, %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm23 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, %xmm26 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm31 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm20, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm30 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm10, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm22, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm20, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm6 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm17 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm28, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm31, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm20, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm25, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm16, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; 
AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm26, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm30, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm10, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm22, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm20, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm19, %zmm6 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm28, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm30, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm31, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm13, %xmm20, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm23, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm27, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm31, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm19, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm25, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; 
AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm24, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm30, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm10, %zmm6 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = 
xmm6[0,1],xmm5[2,3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm22, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm19, %zmm7 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm18, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm31, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm20, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm28, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm30, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm24, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 -; 
AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm19, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm25, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -12799,7 +12810,7 @@ ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX512BW-SLOW-NEXT: addq $664, %rsp # imm = 0x298 ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -83,25 +83,57 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rdx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: store_i16_stride2_vf8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq +; AVX2-SLOW-LABEL: store_i16_stride2_vf8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride2_vf8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-LABEL: store_i16_stride2_vf8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride2_vf8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: store_i16_stride2_vf8: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride2_vf8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf8: ; AVX512BW: # %bb.0: @@ -128,15 +160,15 @@ ; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm4, 16(%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride2_vf16: @@ -213,25 +245,25 @@ ; SSE-NEXT: movdqa 32(%rsi), %xmm6 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhwd 
{{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm6, 112(%rdx) -; SSE-NEXT: movdqa %xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm5, 80(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm4, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm8, 16(%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm3, 112(%rdx) +; SSE-NEXT: movdqa %xmm6, 96(%rdx) +; SSE-NEXT: movdqa %xmm2, 80(%rdx) +; SSE-NEXT: movdqa %xmm5, 64(%rdx) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm4, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride2_vf32: @@ -244,22 +276,22 @@ ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) 
+; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride2_vf32: @@ -270,16 +302,16 @@ ; AVX2-ONLY-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -350,102 +382,102 @@ ; SSE-NEXT: movdqa 32(%rsi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm15 ; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] ; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] ; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] ; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE-NEXT: movdqa 112(%rsi), %xmm11 ; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movdqa %xmm0, 224(%rdx) -; SSE-NEXT: movdqa %xmm7, 240(%rdx) -; SSE-NEXT: movdqa %xmm6, 192(%rdx) -; SSE-NEXT: movdqa %xmm12, 208(%rdx) -; SSE-NEXT: movdqa %xmm4, 160(%rdx) -; SSE-NEXT: movdqa %xmm13, 176(%rdx) -; SSE-NEXT: movdqa %xmm3, 128(%rdx) -; SSE-NEXT: movdqa %xmm15, 144(%rdx) -; SSE-NEXT: movdqa %xmm5, 96(%rdx) -; SSE-NEXT: movdqa %xmm14, 112(%rdx) -; SSE-NEXT: movdqa %xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm10, 80(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm8, (%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, 240(%rdx) +; SSE-NEXT: movdqa %xmm7, 224(%rdx) +; SSE-NEXT: movdqa %xmm6, 208(%rdx) +; SSE-NEXT: movdqa %xmm12, 192(%rdx) +; SSE-NEXT: movdqa %xmm4, 176(%rdx) +; SSE-NEXT: movdqa %xmm13, 160(%rdx) +; SSE-NEXT: movdqa %xmm3, 144(%rdx) +; SSE-NEXT: movdqa %xmm15, 128(%rdx) +; SSE-NEXT: movdqa %xmm5, 112(%rdx) +; SSE-NEXT: movdqa %xmm14, 96(%rdx) +; SSE-NEXT: movdqa %xmm2, 80(%rdx) +; SSE-NEXT: movdqa %xmm10, 64(%rdx) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm9, 32(%rdx) +; SSE-NEXT: movdqa %xmm8, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 
64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] 
-; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 224(%rdx) +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 224(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 240(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 112(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 192(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 208(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 160(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 176(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 160(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 176(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 128(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 144(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 112(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride2_vf64: @@ -460,82 +492,82 @@ ; AVX2-ONLY-NEXT: vmovdqa 96(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 192(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i16_stride2_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-NEXT: vmovdqa 96(%rsi), 
%xmm5 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm7 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512F-NEXT: vmovdqa %xmm9, 48(%rdx) -; AVX512F-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %xmm7, 16(%rdx) -; AVX512F-NEXT: vmovdqa %xmm14, (%rdx) -; AVX512F-NEXT: vmovdqa %xmm11, 112(%rdx) -; AVX512F-NEXT: vmovdqa %xmm13, 96(%rdx) -; AVX512F-NEXT: vmovdqa %xmm10, 80(%rdx) -; AVX512F-NEXT: vmovdqa %xmm15, 64(%rdx) -; AVX512F-NEXT: vmovdqa %xmm6, 240(%rdx) -; AVX512F-NEXT: vmovdqa %xmm8, 224(%rdx) -; AVX512F-NEXT: vmovdqa %xmm5, 208(%rdx) -; AVX512F-NEXT: vmovdqa %xmm4, 192(%rdx) -; AVX512F-NEXT: vmovdqa %xmm3, 176(%rdx) -; AVX512F-NEXT: vmovdqa %xmm2, 160(%rdx) -; AVX512F-NEXT: vmovdqa %xmm1, 144(%rdx) -; AVX512F-NEXT: vmovdqa64 %xmm16, 128(%rdx) +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-NEXT: 
vmovdqa 80(%rsi), %xmm9 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512F-NEXT: vmovdqa %xmm13, 224(%rdx) +; AVX512F-NEXT: vmovdqa %xmm0, 240(%rdx) +; AVX512F-NEXT: vmovdqa %xmm11, 192(%rdx) +; AVX512F-NEXT: vmovdqa %xmm14, 208(%rdx) +; AVX512F-NEXT: vmovdqa %xmm9, 160(%rdx) +; AVX512F-NEXT: vmovdqa %xmm12, 176(%rdx) +; AVX512F-NEXT: vmovdqa %xmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa %xmm10, 144(%rdx) +; AVX512F-NEXT: vmovdqa %xmm5, 96(%rdx) +; AVX512F-NEXT: vmovdqa %xmm7, 112(%rdx) +; AVX512F-NEXT: vmovdqa %xmm4, 64(%rdx) +; AVX512F-NEXT: vmovdqa %xmm6, 80(%rdx) +; AVX512F-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX512F-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512F-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512F-NEXT: vmovdqa64 %xmm16, 16(%rdx) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf64: @@ -544,17 +576,17 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 @@ -567,9 +599,6 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX1: {{.*}} ; AVX2: {{.*}} -; AVX2-FAST: {{.*}} -; AVX2-FAST-PERLANE: {{.*}} -; AVX2-SLOW: {{.*}} ; AVX512: {{.*}} ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} @@ -581,10 +610,8 @@ ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F-FAST: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} -; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -24,8 +24,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movd %xmm0, 8(%rcx) @@ -102,65 +102,117 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,4,5,8,9,14,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rcx) ; AVX1-ONLY-NEXT: vmovq %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: store_i16_stride3_vf4: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rcx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq +; AVX2-SLOW-LABEL: store_i16_stride3_vf4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vmovq %xmm1, 16(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride3_vf4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovq %xmm1, 16(%rcx) -; AVX512F-NEXT: vmovdqa %xmm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-LABEL: store_i16_stride3_vf4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,0,0,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovq %xmm1, 16(%rcx) +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovq 
{{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, 16(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: store_i16_stride3_vf4: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13],zero,zero,ymm0[22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovq %xmm1, 16(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride3_vf4: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13],zero,zero,ymm0[22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-FAST-NEXT: vmovq %xmm1, 16(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride3_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u> -; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx) -; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,1,9,17,2,10,18,3,11,19,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx) +; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512BW-NEXT: 
vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 @@ -311,20 +363,21 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-SLOW-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper @@ -378,124 +431,124 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; 
SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 ; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = 
xmm10[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, 32(%rcx) -; SSE-NEXT: movdqa %xmm6, 80(%rcx) +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, 80(%rcx) +; SSE-NEXT: movdqa %xmm4, 32(%rcx) +; SSE-NEXT: movdqa %xmm6, 64(%rcx) +; SSE-NEXT: movdqa %xmm10, 48(%rcx) +; SSE-NEXT: movdqa %xmm8, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: movdqa %xmm8, 48(%rcx) -; SSE-NEXT: movdqa %xmm5, 64(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] 
+; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6],xmm12[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf16: @@ -549,7 +602,7 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -593,7 +646,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -698,343 +751,339 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind 
{ ; SSE-LABEL: store_i16_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa 48(%rsi), %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: movdqa 48(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa 32(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm3 
-; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0] -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa 48(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm13, %xmm3 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm12, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 80(%rcx) -; SSE-NEXT: movdqa %xmm0, 128(%rcx) +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm11, 176(%rcx) -; SSE-NEXT: movdqa %xmm5, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm15, 48(%rcx) -; SSE-NEXT: movdqa %xmm14, 64(%rcx) -; SSE-NEXT: movdqa %xmm13, 96(%rcx) -; SSE-NEXT: movdqa %xmm10, 112(%rcx) -; SSE-NEXT: movdqa %xmm9, 144(%rcx) +; SSE-NEXT: movdqa %xmm4, 128(%rcx) +; SSE-NEXT: movdqa %xmm0, 80(%rcx) +; SSE-NEXT: movdqa %xmm10, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 160(%rcx) +; SSE-NEXT: movdqa %xmm2, 144(%rcx) +; SSE-NEXT: movdqa %xmm15, 112(%rcx) +; SSE-NEXT: movdqa %xmm14, 96(%rcx) +; SSE-NEXT: movdqa %xmm12, 64(%rcx) +; SSE-NEXT: movdqa %xmm7, 48(%rcx) +; SSE-NEXT: movdqa %xmm9, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 
48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1,2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4],xmm11[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm7[1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] -; 
AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm4[2],xmm14[3,4],xmm4[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm11[2],xmm0[3,4],xmm11[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1,2],xmm15[3],xmm11[4,5],xmm15[6],xmm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd 
{{.*#+}} xmm7 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3,4],xmm12[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rcx) +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 176(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 128(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 144(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 144(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) ; 
AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <5,5,u,6,6,u,7,7> +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm13[2],xmm10[3,4],xmm13[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; 
AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm10, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm9, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm10, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,3,3,u,4,4,u> ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 96(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1042,75 +1091,75 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <5,5,u,6,6,u,7,7> +; 
AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm6 ; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1119,156 +1168,153 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2],xmm3[3,4],xmm9[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1],xmm2[2],xmm9[3,4],xmm2[5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: store_i16_stride3_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-NEXT: vprold $16, %xmm2, %xmm4 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512F-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-NEXT: vprold $16, %xmm3, %xmm7 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = 
[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vprold $16, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> -; AVX512F-NEXT: vpermd (%rdx), %zmm4, %zmm4 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512F-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX512F-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3,4],xmm9[5],xmm6[6,7] +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm3, %ymm10, %ymm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpandn %ymm10, %ymm12, %ymm10 +; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm10 +; AVX512F-NEXT: vprold $16, %xmm6, %xmm7 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX512F-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = 
[0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpandn %ymm9, %ymm10, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-NEXT: vprold $16, %xmm0, %xmm5 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-NEXT: vpshufb %ymm10, %ymm6, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vpermd (%rdx), %zmm5, %zmm5 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512F-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <5,5,u,6,6,u,7,7> +; AVX512F-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, 
128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1664,201 +1710,201 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $280, %rsp # imm = 0x118 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm9 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; 
AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3,4],xmm15[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3,4],xmm15[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 
= xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm10[2],xmm14[3,4],xmm10[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm11[1],xmm14[2,3],xmm11[4],xmm14[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3],xmm9[4],xmm5[5,6],xmm9[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm15[2],xmm11[3,4],xmm15[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3,4],xmm9[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 288(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 368(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 320(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 336(%rcx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3,4],xmm9[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 368(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 320(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 336(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 288(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 272(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 224(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 240(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 192(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1866,320 +1912,324 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 240(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx) -; AVX1-ONLY-NEXT: addq $280, %rsp # imm = 0x118 +; AVX1-ONLY-NEXT: 
vmovaps %xmm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,5,u,6,6,u,7,7> -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <5,5,u,6,6,u,7,7> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; 
AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3,4],xmm13[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm11, %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,2] +; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm10 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2],xmm2[3,4],xmm10[5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm10 +; 
AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3,4],xmm14[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm14 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm14, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm12, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm13 +; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm15 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm15 -; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,3,3,u,4,4,u> -; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: 
vpblendvb %ymm14, %ymm13, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 320(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 320(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 128(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm11, 288(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 192(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 352(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride3_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, 
%ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm15 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3,4],xmm15[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm13 ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm3, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), 
%ymm14 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 320(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 128(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 352(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 352(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 256(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 192(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper @@ -2187,143 +2237,144 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm13, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), 
%xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm10, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, 
%xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3,4],xmm15[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3,4],xmm3[5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, 
%ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 352(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 352(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 256(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -2331,155 +2382,148 @@ ; ; AVX512F-LABEL: store_i16_stride3_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512F-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = 
xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm14 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = -; AVX512F-NEXT: vpermd %ymm3, %ymm19, %ymm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpandnq %ymm3, %ymm16, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX512F-NEXT: vprold $16, %xmm10, %xmm11 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm11, %xmm15, %xmm15 -; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm12[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> -; AVX512F-NEXT: vpermd 64(%rdx), %zmm20, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm21, %zmm10 -; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-NEXT: vpshufb %ymm7, %ymm15, %ymm15 -; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm12 -; AVX512F-NEXT: vpshufb %ymm2, %ymm12, %ymm12 -; AVX512F-NEXT: vpor %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; 
AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3,4],xmm13[5],xmm15[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm12[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa 96(%rdx), %ymm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <5,5,u,6,6,u,7,7> -; AVX512F-NEXT: vpermd %ymm12, %ymm23, %ymm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpandnq %ymm15, %ymm22, %ymm15 -; AVX512F-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm5, %zmm18 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512F-NEXT: vpshufb %ymm2, %ymm15, %ymm15 -; AVX512F-NEXT: vpor %ymm5, %ymm15, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm15 -; AVX512F-NEXT: vprold $16, %xmm15, %xmm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm15 -; AVX512F-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-NEXT: vprold $16, %xmm3, %xmm4 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2],xmm8[3,4],xmm4[5],xmm8[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = +; AVX512F-NEXT: vpermd %ymm2, %ymm18, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpandn %ymm2, %ymm15, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vpternlogq $248, %zmm13, %zmm12, %zmm2 +; AVX512F-NEXT: vprold $16, %xmm7, %xmm12 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512F-NEXT: vpermd %ymm14, %ymm19, %ymm7 -; AVX512F-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm0, %zmm5 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vpermd (%rdx), %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512F-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm12 +; AVX512F-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512F-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512F-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX512F-NEXT: vprold $16, %xmm12, %xmm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512F-NEXT: vpermd %ymm8, %ymm18, %ymm8 +; AVX512F-NEXT: vpandn %ymm8, %ymm15, %ymm8 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm8 +; AVX512F-NEXT: vpternlogq $248, %zmm13, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512F-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm1[2],xmm11[3,4],xmm1[5],xmm11[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; 
AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm9 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX512F-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3,4],xmm9[5],xmm1[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermd 64(%rdx), %zmm16, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX512F-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX512F-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512F-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX512F-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-NEXT: vprold $16, %xmm10, %xmm10 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> +; AVX512F-NEXT: vpermd %ymm6, %ymm9, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpandn %ymm10, %ymm11, %ymm10 +; AVX512F-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpternlogq $248, %zmm10, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512F-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512F-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512F-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm13[2],xmm7[3,4],xmm13[5],xmm7[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm7, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-NEXT: vpermd %ymm8, %ymm23, %ymm7 -; AVX512F-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512F-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-NEXT: 
vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm0, %zmm7 -; AVX512F-NEXT: vprold $16, %xmm6, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2],xmm8[3,4],xmm0[5],xmm8[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-NEXT: vprold $16, %xmm24, %xmm4 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm25[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpermd (%rdx), %zmm20, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512F-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm12, %ymm7 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-NEXT: vpermd %ymm3, %ymm9, %ymm7 +; AVX512F-NEXT: vpandn %ymm7, %ymm11, %ymm7 +; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq $248, %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2496,27 +2540,26 @@ ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u> -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 
%zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u> +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2533,6 +2576,7 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX2: {{.*}} +; AVX2-ONLY: {{.*}} ; AVX512: {{.*}} ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -20,47 +20,92 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, (%r8) ; SSE-NEXT: retq ; -; AVX1-LABEL: store_i16_stride4_vf2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] -; AVX1-NEXT: vmovdqa %xmm0, (%r8) -; AVX1-NEXT: retq +; AVX1-ONLY-LABEL: store_i16_stride4_vf2: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r8) +; AVX1-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride4_vf2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] -; AVX512F-NEXT: vmovdqa %xmm0, (%r8) -; AVX512F-NEXT: retq +; AVX2-SLOW-LABEL: store_i16_stride4_vf2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i16_stride4_vf2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] -; AVX512BW-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vmovdqa %xmm2, (%r8) -; AVX512BW-NEXT: retq +; AVX2-FAST-LABEL: store_i16_stride4_vf2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride4_vf2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-SLOW-LABEL: store_i16_stride4_vf2: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX512-SLOW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-SLOW-NEXT: vmovdqa %xmm0, (%r8) +; AVX512-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride4_vf2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastd (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq (%rcx){1to4}, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-FAST-LABEL: store_i16_stride4_vf2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX512BW-FAST-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,14,5,15,4,14,5,15] +; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm2, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX512BW-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX512BW-FAST-NEXT: retq %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64 %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64 @@ -236,10 +281,10 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm1, 48(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm1, 32(%r8) ; SSE-NEXT: movdqa %xmm5, 16(%r8) ; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: retq @@ -351,33 +396,33 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm4, %xmm7 ; SSE-NEXT: punpcklwd 
{{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm6, 112(%r8) -; SSE-NEXT: movdqa %xmm8, 64(%r8) -; SSE-NEXT: movdqa %xmm10, 80(%r8) -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm5, 48(%r8) -; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, 112(%r8) +; SSE-NEXT: movdqa %xmm6, 96(%r8) +; SSE-NEXT: movdqa %xmm8, 80(%r8) +; SSE-NEXT: movdqa %xmm10, 64(%r8) +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movdqa %xmm2, 16(%r8) +; SSE-NEXT: movdqa %xmm3, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride4_vf16: @@ -408,26 +453,26 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vzeroupper @@ -461,26 +506,26 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-SLOW-NEXT: vzeroupper @@ -511,22 +556,22 @@ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-FAST-NEXT: 
vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r8) ; AVX2-FAST-NEXT: vzeroupper @@ -560,26 +605,26 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -591,27 +636,27 @@ ; AVX512F-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -662,104 +707,104 @@ ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: movdqa %xmm6, 
%xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSE-NEXT: movdqa 48(%rdx), %xmm15 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; 
SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; SSE-NEXT: movdqa %xmm2, 224(%r8) -; SSE-NEXT: movdqa %xmm1, 240(%r8) -; SSE-NEXT: movdqa %xmm3, 192(%r8) -; SSE-NEXT: movdqa %xmm0, 208(%r8) -; SSE-NEXT: movdqa %xmm4, 160(%r8) -; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm13, 128(%r8) -; SSE-NEXT: movdqa %xmm14, 144(%r8) -; SSE-NEXT: movdqa %xmm11, 96(%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r8) -; SSE-NEXT: movdqa %xmm7, 64(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: movdqa %xmm1, 224(%r8) +; SSE-NEXT: movdqa %xmm3, 208(%r8) +; SSE-NEXT: movdqa %xmm0, 192(%r8) +; SSE-NEXT: movdqa %xmm4, 176(%r8) +; SSE-NEXT: movdqa %xmm9, 160(%r8) +; SSE-NEXT: movdqa %xmm13, 144(%r8) +; SSE-NEXT: movdqa %xmm14, 128(%r8) +; SSE-NEXT: movdqa %xmm11, 112(%r8) +; SSE-NEXT: movdqa %xmm8, 96(%r8) +; SSE-NEXT: movdqa %xmm7, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movdqa %xmm5, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movdqa %xmm6, (%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movdqa %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 @@ -792,9 +837,9 @@ ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -822,45 +867,45 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride4_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 @@ -893,9 +938,9 @@ ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -923,44 +968,44 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 160(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 128(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 224(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 192(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: 
retq ; ; AVX2-FAST-LABEL: store_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,1,1,2,2,3,3] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4],ymm12[5],ymm2[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -986,12 +1031,12 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm12 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm6 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7] @@ -1009,45 +1054,45 @@ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 192(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm9, 128(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 224(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 192(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride4_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 @@ -1080,9 +1125,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -1110,27 +1155,27 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride4_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -1140,14 +1185,14 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
(%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] @@ -1217,23 +1262,23 @@ ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride4_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -1243,14 +1288,14 @@ ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm3 +; 
AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm4 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm11, %zmm11 @@ -1297,23 +1342,23 @@ ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 ; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm13, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride4_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -1323,14 +1368,14 @@ ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm4 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] @@ -1396,10 +1441,10 @@ ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 
$1, %xmm3, %ymm4, %ymm3 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1657,134 +1702,134 @@ ; AVX1-ONLY-LABEL: store_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $40, %rsp -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm11[0],zero,xmm11[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpmovzxdq 
{{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm12[0],zero,xmm12[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, 
%ymm12, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4],ymm4[5],ymm13[6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd 
{{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm15[0],zero,xmm15[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -1794,14 +1839,14 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -1815,21 +1860,21 @@ ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4],ymm13[5],ymm0[6],ymm13[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1838,27 +1883,27 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: addq $40, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1866,134 
+1911,134 @@ ; AVX2-SLOW-LABEL: store_i16_stride4_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $40, %rsp -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm12[0],zero,xmm12[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm15, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4],ymm4[5],ymm13[6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 80(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -2003,14 +2048,14 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -2024,21 +2069,21 @@ ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; 
AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4],ymm13[5],ymm0[6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 @@ -2047,27 +2092,27 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 480(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 448(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 384(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 224(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 192(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 128(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 352(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 480(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 448(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 416(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 384(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 352(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 320(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 256(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 224(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 192(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 160(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) ; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -2075,113 +2120,113 @@ ; AVX2-FAST-LABEL: store_i16_stride4_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 
(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4],ymm14[5],ymm0[6],ymm14[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm13[1],ymm3[2],ymm13[3],ymm3[4],ymm13[5],ymm3[6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm12 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 80(%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 80(%rdx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm13[1],ymm5[2],ymm13[3],ymm5[4],ymm13[5],ymm5[6],ymm13[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm13 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = 
xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4],ymm14[5],ymm8[6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2],ymm0[3],ymm10[4],ymm0[5],ymm10[6],ymm0[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm11[1],ymm0[2],ymm11[3],ymm0[4],ymm11[5],ymm0[6],ymm11[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm9[0],zero,xmm9[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm14 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2],ymm13[3],ymm9[4],ymm13[5],ymm9[6],ymm13[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 80(%rcx), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 80(%rdx), %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2],ymm15[3],ymm9[4],ymm15[5],ymm9[6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2],ymm15[3],ymm11[4],ymm15[5],ymm11[6],ymm15[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -2189,14 +2234,14 @@ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4],ymm12[5],ymm0[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 -; AVX2-FAST-NEXT: vmovdqa 112(%rcx), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 112(%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm3[1],ymm13[2],ymm3[3],ymm13[4],ymm3[5],ymm13[6],ymm3[7] @@ -2207,14 +2252,14 @@ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 112(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 112(%rdx), %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2],ymm5[3],ymm14[4],ymm5[5],ymm14[6],ymm5[7] @@ -2225,27 +2270,27 @@ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 480(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 448(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 416(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 384(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 224(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 192(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 160(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 352(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 480(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 448(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 416(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 384(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 352(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 320(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 288(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 224(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 192(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 160(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) ; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -2253,134 +2298,134 @@ ; AVX2-FAST-PERLANE-LABEL: store_i16_stride4_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; 
AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm12[0],zero,xmm12[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
80(%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4],ymm4[5],ymm13[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm15[0],zero,xmm15[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ 
-2390,14 +2435,14 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -2411,21 +2456,21 @@ ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4],ymm13[5],ymm0[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 @@ -2434,573 +2479,570 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 480(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 448(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 384(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 352(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 480(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 448(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 416(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 384(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 352(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 320(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 256(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 160(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride4_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), 
%xmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm14, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm14, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm0[0],zero,xmm0[1],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = 
xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm15, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm16, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd 
{{.*#+}} xmm9 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm16, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm9[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm16, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm16, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm4 +; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa32 %zmm5, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 320(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rcx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rsi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 
= xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 448(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 384(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride4_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %xmm20 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm10 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm5, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm8[0],zero,xmm8[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), 
%xmm15 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 112(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm2[0],zero,xmm2[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 80(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: 
vmovdqa 80(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 80(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm5, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 
16(%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512F-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm1, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm13[0],zero,xmm13[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm2[0],zero,xmm2[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm1, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm8, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm1, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = 
xmm4[0],zero,xmm4[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm1, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 80(%rsi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 384(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 64(%r8) +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa 80(%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 80(%rdx), %xmm8 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm1, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm1, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 112(%rsi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa 112(%rcx), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 112(%rdx), %xmm10 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm1, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 448(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 384(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride4_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 ; AVX512DQ-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, 
%zmm6, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm13, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdx), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm6[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = 
xmm10[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm11, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm8, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm5, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; 
AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm6, %ymm16, %ymm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm13[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm13[0],zero,xmm13[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm13[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = 
xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm20, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm11, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm6, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm5, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; 
AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rcx), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 320(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 256(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 448(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 384(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm6, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 448(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -3014,32 +3056,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
<0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u> ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -3054,14 +3096,14 @@ ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 @@ -3077,11 +3119,10 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX: {{.*}} +; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX512: {{.*}} ; AVX512-FAST: {{.*}} -; AVX512-SLOW: {{.*}} -; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -18,54 +18,57 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,7,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] ; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movd %xmm0, 16(%r9) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movd %xmm1, 16(%r9) ; SSE-NEXT: movdqa %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf2: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovd %xmm1, 16(%r9) -; 
AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovd %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf2: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -81,16 +84,20 @@ ; ; AVX2-FAST-LABEL: store_i16_stride5_vf2: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vmovd %xmm1, 16(%r9) ; 
AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9) @@ -99,11 +106,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf2: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -115,31 +124,55 @@ ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride5_vf2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, 16(%r9) -; AVX512F-NEXT: vmovdqa %xmm0, (%r9) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: store_i16_stride5_vf2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovd %xmm1, 16(%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r9) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride5_vf2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdx), 
%xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-FAST-NEXT: vmovd %xmm1, 16(%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%r9) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride5_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 @@ -216,57 +249,60 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,11,10,11,u,u,u,u,4,5,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm3[0,3,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[10,11,10,11,u,u,u,u,4,5,12,13,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,1,2,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovq %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovq %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = 
xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6],ymm0[7],ymm1[8,9],ymm0[10,11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm2, 32(%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -274,25 +310,26 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6],ymm0[7],ymm1[8,9],ymm0[10,11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm2, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -300,78 +337,79 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6],ymm0[7],ymm1[8,9],ymm0[10,11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf4: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: movq (%r8), %rax ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7],ymm3[8,9],ymm0[10,11],ymm3[12,13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; 
AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf4: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: movq (%r8), %rax ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7],ymm3[8,9],ymm0[10,11],ymm3[12,13,14],ymm0[15] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -383,14 +421,13 @@ ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512BW-NEXT: vmovq %xmm1, 32(%r9) -; 
AVX512BW-NEXT: vmovdqa %ymm0, (%r9) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,4,8,12,32,1,5,9,13,33,2,6,10,14,34,3,7,11,15,35,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, 32(%r9) +; AVX512BW-NEXT: vmovdqa %ymm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 @@ -411,23 +448,23 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm5 ; SSE-NEXT: movdqa (%r8), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 @@ -439,31 +476,31 @@ ; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm4 ; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] ; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 ; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] ; SSE-NEXT: psrlq $48, %xmm7 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = 
xmm10[1],xmm7[1] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] ; SSE-NEXT: pand %xmm7, %xmm12 @@ -472,29 +509,29 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, 16(%r9) -; SSE-NEXT: movdqa %xmm5, 48(%r9) +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) ; SSE-NEXT: movdqa %xmm1, 64(%r9) ; SSE-NEXT: movdqa %xmm0, (%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) @@ -502,55 +539,55 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd 
{{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1,2,3,4],xmm9[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3,4],xmm7[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%r9) -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm9[2],xmm1[3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf8: @@ -608,24 +645,24 @@ ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,5,2,6,2,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[2,3,6,7,2,3],zero,zero,zero,zero,ymm7[8,9,12,13,16,17],zero,zero,zero,zero,ymm7[18,19,22,23,28,29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,2,6,u,2,6,3,7> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3],zero,zero,zero,zero,zero,zero,ymm8[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25,28,29] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,12,13],zero,zero,zero,zero,ymm7[2,3,18,19,18,19],zero,zero,zero,zero,ymm7[28,29,20,21,28,29],zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,2,6,2,6,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,2,3],zero,zero,zero,zero,ymm6[8,9,12,13,16,17],zero,zero,zero,zero,ymm6[18,19,22,23,28,29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,2,6,u,2,6,3,7> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,zero,zero,ymm5[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm5[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm5[24,25,28,29] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -634,8 +671,8 @@ ; AVX2-FAST-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -800,321 +837,337 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa 16(%rsi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%r8), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm5, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa (%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: pandn %xmm7, %xmm13 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm15, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: por %xmm5, %xmm13 ; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm8, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa (%r8), %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm14 +; SSE-NEXT: movdqa 16(%rdx), 
%xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm15 +; 
SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,2] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm11, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm2 -; 
SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%r9) -; SSE-NEXT: movdqa %xmm13, 16(%r9) -; SSE-NEXT: movdqa %xmm15, 48(%r9) -; SSE-NEXT: movdqa %xmm9, 64(%r9) -; SSE-NEXT: movdqa %xmm7, 80(%r9) -; SSE-NEXT: movdqa %xmm12, 96(%r9) -; SSE-NEXT: movdqa %xmm14, 128(%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} 
xmm3 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, 144(%r9) +; SSE-NEXT: movdqa %xmm15, 128(%r9) +; SSE-NEXT: movdqa %xmm14, 96(%r9) +; SSE-NEXT: movdqa %xmm8, 80(%r9) +; SSE-NEXT: movdqa %xmm4, 64(%r9) +; SSE-NEXT: movdqa %xmm10, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = 
xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2,3,4],xmm7[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm6, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1,2,3],xmm8[4],xmm12[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm13[2],xmm8[3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm9 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm9 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3,4,5],xmm12[6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4,5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1,2,3,4],xmm14[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] 
-; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm3[2],xmm6[3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm11[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3,4],xmm6[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm6[2],xmm12[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm7 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm15[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 
112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 144(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1141,19 +1194,20 @@ ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3,4],ymm11[5,6,7,8],ymm10[9],ymm11[10],ymm10[11,12],ymm11[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm9[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1168,20 +1222,19 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1195,9 +1248,9 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1237,31 +1290,31 @@ ; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1275,8 +1328,8 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -1317,31 +1370,31 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = 
[0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1355,8 +1408,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1364,155 +1417,144 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; 
AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5],xmm6[6],xmm8[7] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm1, %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm7, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpandn %ymm6, %ymm8, %ymm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512F-SLOW-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,2,2,2] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,8,9,8,8] +; AVX512F-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm7 ; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 
= ymm0[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm8, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm8, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512F-FAST-NEXT: vpshufd 
{{.*#+}} ymm0 = ymm0[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512F-FAST-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1526,19 +1568,19 @@ ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28> +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512BW-NEXT: vmovdqa %ymm4, 128(%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 @@ -1981,598 +2023,587 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $56, %rsp -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; 
AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm4[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2,3,4],xmm7[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; 
AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm15[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), 
%xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm1[1],xmm3[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm15[1],xmm2[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm11[2],xmm4[3,4,5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm3[4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm1[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3,4],xmm1[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm3[2],xmm1[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm0[1,2,3,4],xmm3[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1,2,3],xmm15[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = 
xmm8[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = 
xmm9[0,1,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm12[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2,3,4],xmm5[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm13[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; 
AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2],xmm5[3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2],xmm3[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm9[1],xmm1[2,3,4,5],xmm9[6],xmm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm8[2],xmm10[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3,4],xmm8[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 304(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 256(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 272(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 224(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 240(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 192(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 208(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 160(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 176(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) -; AVX1-ONLY-NEXT: addq $56, %rsp +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $72, %rsp +; AVX2-SLOW-NEXT: subq $40, %rsp ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: 
vmovdqa 32(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm13, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: 
vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm7, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm10[1],xmm14[2],xmm10[3],xmm14[4,5],xmm10[6],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = 
[18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3],ymm0[4],ymm12[5,6],ymm0[7],ymm12[8,9],ymm0[10],ymm12[11],ymm0[12],ymm12[13,14],ymm0[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm13, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = 
ymm4[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm14[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3,4],ymm1[5,6,7,8],ymm6[9],ymm1[10],ymm6[11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2],ymm10[3,4],ymm7[5,6,7,8],ymm10[9],ymm7[10],ymm10[11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 256(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 256(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 96(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps 
%ymm0, (%r9) -; AVX2-SLOW-NEXT: addq $72, %rsp +; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $40, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-NEXT: subq $104, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] ; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm2[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 
32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm8 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5],xmm12[6],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5],xmm9[6],xmm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpshufd 
{{.*#+}} ymm2 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8,9],ymm2[10],ymm14[11],ymm2[12],ymm14[13,14],ymm2[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm12, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm2 = ymm2[2,2,3,2] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] @@ -2580,169 +2611,174 @@ ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 288(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $40, %rsp +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = 
[255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5],xmm14[6],xmm7[7] -; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5],xmm14[6],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm2[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5],xmm12[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5],xmm9[6],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, 
%ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7,8],ymm14[9],ymm0[10],ymm14[11],ymm0[12,13],ymm14[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3,4],ymm12[5,6,7,8],ymm6[9],ymm12[10],ymm6[11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10,11],ymm6[12],ymm1[13],ymm6[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8,9],ymm2[10],ymm14[11],ymm2[12],ymm14[13,14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm15[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3],ymm6[4],ymm2[5,6],ymm6[7],ymm2[8,9],ymm6[10],ymm2[11],ymm6[12],ymm2[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] @@ -2750,333 +2786,335 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 288(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 256(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm3 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm1[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), 
%ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm8[0,1,2,3],zmm6[0,1,0,1] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10],ymm14[11],ymm10[12,13],ymm14[14],ymm10[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm10 +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm18[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = 
xmm9[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm21, %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm7, %zmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5],ymm10[6],ymm14[7,8],ymm10[9],ymm14[10,11],ymm10[12],ymm14[13],ymm10[14],ymm14[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm10 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm6, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1],ymm4[2],ymm12[3],ymm4[4],ymm12[5,6],ymm4[7],ymm12[8,9],ymm4[10],ymm12[11],ymm4[12],ymm12[13,14],ymm4[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm18[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; 
AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2],ymm11[3,4],ymm13[5,6,7,8],ymm11[9],ymm13[10],ymm11[11,12],ymm13[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm16, %zmm11 +; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm9, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm20[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13],ymm0[14],ymm5[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm6, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm17, %zmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf32: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm19 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm19[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-FAST-NEXT: 
vpbroadcastq 40(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm17 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm17[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2],xmm14[3],xmm13[4,5],xmm14[6],xmm13[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7,8],ymm4[9],ymm10[10],ymm4[11],ymm10[12,13],ymm4[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = 
[10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2],xmm15[3],xmm14[4,5],xmm15[6],xmm14[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm15 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm16, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm17 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vpandnq %ymm1, %ymm16, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm4 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2],xmm15[3],xmm3[4,5],xmm15[6],xmm3[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,0,1,8,9,8,8] -; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5],xmm12[6],xmm13[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,0,1,8,9,8,8] +; AVX512F-FAST-NEXT: vpermi2q %zmm12, %zmm1, %zmm18 +; AVX512F-FAST-NEXT: 
vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm23 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm14, %zmm16, %zmm1 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm3[2],ymm13[3],ymm3[4],ymm13[5,6],ymm3[7],ymm13[8,9],ymm3[10],ymm13[11],ymm3[12],ymm13[13,14],ymm3[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm7 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; 
AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm18, %zmm16, %zmm1 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[0,1,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm18[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm13 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8],ymm13[9],ymm15[10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10,11],ymm9[12],ymm15[13],ymm9[14],ymm15[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm12, %ymm15, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-FAST-NEXT: vmovdqa64 
%xmm22, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3,4],ymm6[5,6,7,8],ymm7[9],ymm6[10],ymm7[11,12],ymm6[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermi2q %zmm9, %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm18[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,3,2,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,1,4,5,6,5] ; AVX512F-FAST-NEXT: vprolq $16, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3],ymm7[4],ymm5[5,6],ymm7[7],ymm5[8,9],ymm7[10],ymm5[11],ymm7[12],ymm5[13,14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,3,2,3,10,11,10,10] -; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2],ymm11[3,4],ymm5[5,6,7,8],ymm11[9],ymm5[10],ymm11[11,12],ymm5[13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,3,2,3,10,10,11,10] -; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,3,2,3,10,11,10,10] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpshufb %ymm10, 
%ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,3,2,3,10,10,11,10] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm16, %zmm3 ; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,3,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,3,2,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm5 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -4029,511 +4067,506 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX1-ONLY-NEXT: 
vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm13 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; AVX1-ONLY-NEXT: 
vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1,2,3],xmm12[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm14[1],xmm4[2,3,4,5],xmm14[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = 
xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm8[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm12[2],xmm8[3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm7[3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, 
%ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm8[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3,4],xmm7[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm4[1],xmm12[2,3,4,5],xmm4[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4],xmm5[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = 
zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} 
xmm8 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = 
xmm9[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm9[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1,2,3,4],xmm0[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; 
AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = 
xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm4[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm4[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: 
vmovdqa 96(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm11[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm3[1,2,3,4],xmm0[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; 
AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3],xmm6[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm4[2],xmm13[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = 
xmm15[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm10[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm7[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm7[4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = 
zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm0[1,2,3,4],xmm6[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 624(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 608(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 592(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 576(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 560(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 544(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 528(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 512(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 496(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 480(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 464(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 448(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r9) +; 
AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 624(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 608(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%r9) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4543,34 +4576,30 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 
464(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%r9) -; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -4642,11 +4671,11 @@ ; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm4 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7] @@ -4708,8 +4737,7 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] ; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] @@ -4859,11 +4887,11 @@ ; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -4953,145 +4981,143 @@ ; AVX2-FAST-LABEL: store_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm11, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-NEXT: 
vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm13, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, 
%ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm4 +; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm6 +; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = mem[1,2,2,2] +; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm6 +; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] @@ -5099,7 +5125,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 @@ -5136,8 +5162,8 @@ ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 @@ -5151,7 +5177,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm13 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 @@ -5162,13 +5188,12 @@ ; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5179,8 +5204,8 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm13 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm15 @@ -5207,8 +5232,8 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] @@ -5225,13 +5250,13 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm9 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm11 ; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm15 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload @@ -5243,7 +5268,7 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm7[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm7 @@ -5321,145 +5346,143 @@ ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm11, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm13, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = 
[6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd 
$169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2],xmm8[3],xmm6[4,5],xmm8[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2],xmm5[3],xmm0[4,5],xmm5[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,2,1,4,5,6,5] @@ -5467,7 +5490,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 @@ -5504,8 +5527,8 @@ ; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 96(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 @@ -5519,7 +5542,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm7, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm1 @@ -5530,13 +5553,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5547,8 +5569,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm15 @@ -5575,8 +5597,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10],ymm13[11],ymm0[12,13],ymm13[14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3,4],ymm12[5,6,7,8],ymm13[9],ymm12[10],ymm13[11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] @@ -5593,13 +5615,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm10, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload @@ -5611,7 +5633,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm6, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10,11],ymm13[12],ymm10[13],ymm13[14],ymm10[15] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm7 @@ -5688,729 +5710,727 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 
%ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm18[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm4, %ymm17, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4],ymm4[5,6,7,8],ymm3[9],ymm4[10],ymm3[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm3[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: subq $328, %rsp # imm = 0x148 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512F-SLOW-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm8[1],xmm1[2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3,4],ymm14[5,6,7,8],ymm12[9],ymm14[10],ymm12[11,12],ymm14[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm12[0,1,2,3],zmm11[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10],ymm14[11],ymm2[12,13],ymm14[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm2[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4,5],xmm10[6],xmm4[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm28 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm28[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm4, %ymm18, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2],xmm2[3],xmm13[4,5],xmm2[6],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm13[1],xmm2[2,3],xmm13[4],xmm2[5],xmm13[6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3,4],ymm0[5,6,7,8],ymm3[9],ymm0[10],ymm3[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 -; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm30 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm1[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpermq 
{{.*#+}} ymm8 = ymm3[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5],xmm8[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] -; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm17, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm17 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2],xmm5[3],xmm4[4,5],xmm5[6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm2, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm2[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm2[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm19[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,1,1] +; AVX512F-SLOW-NEXT: vpandnq %ymm5, %ymm18, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm18 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm10, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13],ymm3[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = 
[2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13],ymm10[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm4, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5],ymm15[6],ymm1[7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13],ymm15[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm30[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10,11],ymm1[12],ymm14[13],ymm1[14],ymm14[15] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8],ymm1[9],ymm9[10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5],ymm9[6],ymm6[7,8],ymm9[9],ymm6[10,11],ymm9[12],ymm6[13],ymm9[14],ymm6[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, 
%zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm1, %ymm18, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm19[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10,11],ymm1[12],ymm14[13],ymm1[14],ymm14[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm15[1],ymm0[2],ymm15[3,4],ymm0[5,6,7,8],ymm15[9],ymm0[10],ymm15[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm15 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm11[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15] ; AVX512F-SLOW-NEXT: vprolq $16, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm19, %zmm29 -; AVX512F-SLOW-NEXT: vpternlogq $226, (%rsp), %zmm16, %zmm28 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm14 -; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm22 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm19, %zmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm24[0,1,0,1] +; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7,8],ymm7[9],ymm11[10],ymm7[11],ymm11[12,13],ymm7[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10,11],ymm11[12],ymm8[13],ymm11[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm8, %ymm12, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm15 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm26[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3],ymm15[4],ymm3[5,6],ymm15[7],ymm3[8,9],ymm15[10],ymm3[11],ymm15[12],ymm3[13,14],ymm15[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3],ymm13[4],ymm6[5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10,11],ymm13[12],ymm6[13],ymm13[14],ymm6[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm13 +; AVX512F-SLOW-NEXT: vpandn %ymm13, %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm15 = 
mem[0,1,0,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm22 = mem[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpandnq %ymm2, %ymm18, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm27[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm21[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm10[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm21 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm25 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm21, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm25, %zmm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm19, %zmm2 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm21, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm18 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, 
%zmm18, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm28, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm21, %zmm8 -; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm12, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm16, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm30, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm6, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 576(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm20[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm17[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm15 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm14, %zmm29, %zmm15 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm15, %zmm14, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm25 # 64-byte Folded Reload +; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm20 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm29, %zmm20 +; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm16, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm14, %zmm13 +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm16, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $226, (%rsp), %zmm15, %zmm21 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm17 +; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm20 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm16, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm31 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm30, %zmm16, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm16, %zmm31, %zmm27 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm16, %zmm24, %zmm18 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm26, %zmm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm29, %zmm9 +; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm16, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm29, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm15, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm15, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm0, %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm0, %zmm6, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 384(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 
%zmm10, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 512(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 448(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 320(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 192(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 512(%r9) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%r9) +; AVX512F-SLOW-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm18 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm3, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX512F-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm25 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm27 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm27[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm1, 
%xmm4, %xmm5 -; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm4, %ymm6, %ymm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5],xmm9[6],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm30[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3],xmm9[4],xmm1[5],xmm9[6],xmm1[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,8,9,8,8] -; AVX512F-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm4[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = 
[65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm19 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm18 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm19 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm28 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm28[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm16 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; 
AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,0,1,8,9,8,8] +; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm11 +; AVX512F-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3,4],ymm5[5,6,7,8],ymm7[9],ymm5[10],ymm7[11,12],ymm5[13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,2,3,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm29 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm17 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm29[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1],ymm7[2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10],ymm13[11],ymm7[12,13],ymm13[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,3,2,2,8,9,8,9] +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm26, %zmm13 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, 
%zmm5, %zmm20 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm30 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[3,4],ymm6[5,6,7,8],ymm10[9],ymm6[10],ymm10[11,12],ymm6[13,14,15] +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm5[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm28 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm28[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,3,2,3,10,11,10,10] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,3,2,3,10,10,11,10] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm16[0,1,0,1] +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm1 +; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm6 +; 
AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm13, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm6, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm18[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm8, %zmm18, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm23[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm18, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm29[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5],ymm8[6],ymm11[7,8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13],ymm8[14],ymm11[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3],ymm11[4],ymm2[5,6],ymm11[7],ymm2[8,9],ymm11[10],ymm2[11],ymm11[12],ymm2[13,14],ymm11[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[3,0,3,0,7,4,7,4] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13],ymm15[14],ymm8[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpandn %ymm2, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm9[0],ymm14[1],ymm9[2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7,8],ymm14[9],ymm9[10],ymm14[11],ymm9[12,13],ymm14[14],ymm9[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] -; AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[3,4],ymm9[5,6,7,8],ymm14[9],ymm9[10],ymm14[11,12],ymm9[13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10,11],ymm14[12],ymm7[13],ymm14[14],ymm7[15] -; AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm18, %zmm14 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm13, %zmm31, %zmm14 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm14, %zmm17, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,2,3,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,3,2,2,8,9,8,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm1, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-FAST-NEXT: 
vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm11 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm4 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandnq %ymm2, %ymm23, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [2,3,2,3,10,11,10,10] +; AVX512F-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm25[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3,4],ymm11[5,6,7,8],ymm14[9],ymm11[10],ymm14[11,12],ymm11[13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm14 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,3,2,3,10,10,11,10] +; AVX512F-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm13 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm18, %zmm13 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm9 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm25, %zmm11 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm24[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] -; AVX512F-FAST-NEXT: vpermt2q %zmm6, %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm5, %zmm3, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm5, %zmm16, %zmm29 -; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm19, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm25 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm22, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm25, %zmm26 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm1, %zmm8, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 256(%r9) -; AVX512F-FAST-NEXT: 
vmovdqa64 %zmm0, 448(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 384(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 576(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512F-FAST-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm31[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10,11],ymm13[12],ymm9[13],ymm13[14],ymm9[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3],ymm13[4],ymm10[5,6],ymm13[7],ymm10[8,9],ymm13[10],ymm10[11],ymm13[12],ymm10[13,14],ymm13[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm30[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10],ymm10[11],ymm13[12,13],ymm10[14],ymm13[15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm30[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm13 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm28[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm15[0,1,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm28[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15] +; AVX512F-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX512F-FAST-NEXT: vpandnq %ymm10, %ymm23, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm15 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm16[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3,4],ymm15[5,6,7,8],ymm9[9],ymm15[10],ymm9[11,12],ymm15[13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm8 +; AVX512F-FAST-NEXT: vpermq 
{{.*#+}} ymm13 = ymm13[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13],ymm15[14],ymm8[15] +; AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm18, %zmm8 +; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm9 +; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm25, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm14 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm8, %zmm14, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm8, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm8, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm21, %zmm26, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm31, %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm7, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm7, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 448(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 384(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 512(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 192(%r9) +; AVX512F-FAST-NEXT: addq $296, %rsp # imm = 0x128 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%r8), 
%zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm4, %zmm2 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 -; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44> +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; 
AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm21, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm22, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm23, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u> +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm13 +; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm24, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm26, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u> +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 
128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm18, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm9 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm11 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm22, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 @@ -6443,6 +6463,7 @@ ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} +; AVX512F: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -19,67 +19,67 @@ ; SSE-LABEL: store_i16_stride6_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: movdqa (%r9), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,3,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,0,3,4,5,6,7] -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[1,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movq %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf2: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) -; AVX1-ONLY-NEXT: vmovq %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: 
vmovq %xmm3, 16(%rax) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf2: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] @@ -92,16 +92,18 @@ ; AVX2-FAST-LABEL: store_i16_stride6_vf2: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -113,16 +115,18 @@ ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf2: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -134,16 +138,18 @@ ; AVX512F-SLOW-LABEL: store_i16_stride6_vf2: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] @@ -156,16 +162,18 @@ ; AVX512F-FAST-LABEL: store_i16_stride6_vf2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -177,13 +185,15 @@ ; AVX512BW-LABEL: store_i16_stride6_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u> ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 @@ -227,12 +237,12 @@ ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,1,3] @@ -248,38 +258,41 @@ ; AVX1-ONLY-LABEL: store_i16_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; 
AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -288,30 +301,33 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -320,30 +336,32 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq 
{{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovdqa %xmm4, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -352,28 +370,30 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -382,32 +402,34 @@ ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; 
AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -416,31 +438,33 @@ ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; 
AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,6,1,3,4,6,1,3] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; 
AVX512F-FAST-NEXT: retq ; @@ -449,19 +473,18 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,32,40,1,5,9,13,33,41,2,6,10,14,34,42,3,7,11,15,35,43,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 @@ -485,85 +508,85 @@ ; SSE-LABEL: store_i16_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa (%rcx), %xmm9 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa (%r8), %xmm4 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm6[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm3[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm2, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm11, %xmm3 -; SSE-NEXT: orps %xmm10, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; 
SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[0,1] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,1,2,1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: andnps %xmm11, %xmm5 +; SSE-NEXT: orps %xmm10, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: andps %xmm2, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: andnps %xmm9, %xmm2 -; SSE-NEXT: orps %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] +; SSE-NEXT: andps %xmm3, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: andnps %xmm9, %xmm3 +; SSE-NEXT: orps %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] ; SSE-NEXT: movaps %xmm8, %xmm9 ; SSE-NEXT: andnps %xmm11, %xmm9 ; SSE-NEXT: orps %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] ; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: andnps %xmm13, %xmm11 ; SSE-NEXT: orps %xmm12, %xmm11 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[0,2] -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: andnps %xmm5, %xmm8 -; SSE-NEXT: orps %xmm4, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; 
SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[0,2] ; SSE-NEXT: andps %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] ; SSE-NEXT: andnps %xmm6, %xmm10 ; SSE-NEXT: orps %xmm12, %xmm10 -; SSE-NEXT: movaps %xmm10, 16(%rax) +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: andnps %xmm1, %xmm8 +; SSE-NEXT: orps %xmm0, %xmm8 ; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movaps %xmm10, 16(%rax) ; SSE-NEXT: movaps %xmm11, 64(%rax) ; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm2, 32(%rax) ; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf8: @@ -688,8 +711,10 @@ ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,3,7,2,6,3,7] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,26,27,30,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -745,9 +770,9 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] @@ -784,9 +809,9 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 @@ -814,8 +839,10 @@ ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,3,7,2,6,3,7] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,26,27,30,31,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -863,326 +890,319 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa (%rcx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm2 -; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm6[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm9, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[3,3] -; SSE-NEXT: movdqa (%r8), %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm9[2,3] -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm13 -; SSE-NEXT: orps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = 
xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm4, %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm5 +; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa 16(%rcx), %xmm14 +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm9[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm2[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: andnps %xmm13, %xmm1 +; SSE-NEXT: orps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm5[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: andps %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: andnps %xmm10, %xmm1 +; SSE-NEXT: orps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm10[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: andps %xmm0, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm2 +; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm11[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm14[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; SSE-NEXT: andnps %xmm14, %xmm0 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0],xmm7[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm11, %xmm14 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: andps %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm4[0] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm1[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm3 ; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: andnps %xmm3, %xmm7 -; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] -; SSE-NEXT: pslld $16, %xmm8 
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: andps %xmm15, %xmm11 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm9, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm13[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: andps %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: andps %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: andps %xmm1, %xmm14 -; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: pslld $16, 
%xmm10 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm4, %xmm15 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm10[0,2] -; SSE-NEXT: andps %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] +; SSE-NEXT: andps %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movdqa %xmm9, 160(%rax) +; SSE-NEXT: movdqa %xmm15, 144(%rax) +; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm3, 96(%rax) +; SSE-NEXT: movdqa %xmm0, 64(%rax) ; SSE-NEXT: movdqa %xmm1, 48(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm3, 112(%rax) -; SSE-NEXT: movdqa %xmm11, 160(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) -; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: movdqa %xmm9, 64(%rax) -; SSE-NEXT: movdqa %xmm13, 144(%rax) -; SSE-NEXT: movaps %xmm7, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm14, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: retq ; ; 
AVX1-ONLY-LABEL: store_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm4 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm13[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm13[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 -; AVX1-ONLY-NEXT: vpslld $16, %xmm11, 
%xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 +; AVX1-ONLY-NEXT: vpslld $16, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw 
{{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1],xmm12[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0],xmm2[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, 
%ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm2[0,1],xmm10[0],xmm2[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0,1],xmm11[0],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 160(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 128(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 144(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1239,65 +1259,65 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = 
xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = 
ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -1334,7 +1354,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] @@ -1352,67 +1372,67 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6],ymm14[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-FAST-NEXT: vpbroadcastq 
%xmm15, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,4,0,6,5,4,0,6] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,3,2,1,0,3,2] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd 
%ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,4,0,6,5,4,0,6] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -1420,210 +1440,203 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $24, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm12 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} 
ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm14, %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,0,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 
%xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm14, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[8],ymm3[8],ymm11[9],ymm3[9],ymm11[10],ymm3[10],ymm11[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $24, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride6_vf16: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd 
{{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-SLOW-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,5,20,3,4,21,6,7,13,14,30,14,13,31,15,15] +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,2,2,3,5,6,6,7] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <5,u,14,6,u,15,7,u> -; AVX512F-SLOW-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = 
[8,21,10,11,22,13,14,23] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,8,u,1,9,u,2,10> -; AVX512F-SLOW-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-SLOW-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = 
xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,0,1,17,5,2,18,8,9,24,11,8,25,10,11] +; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm12, %zmm13 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[1,2,2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2],ymm13[3,4],ymm9[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,12,u,5,13,u,6,14> -; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,2,18,2,u,19,3,3,12,28,12,13,29,13,14,30> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,10,2,u,11,3,u> -; AVX512F-SLOW-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512F-SLOW-NEXT: vpermi2d %ymm1, 
%ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1635,80 +1648,78 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %ymm16 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,8,u,1,9,u,2,10> -; AVX512F-FAST-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] -; AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,10,2,u,11,3,u> -; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] -; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,12,u,5,13,u,6,14> -; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm7 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] -; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,14,6,u,15,7,u> +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = 
ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,5,20,3,4,21,6,7,13,14,30,14,13,31,15,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,2,2,3,5,6,6,7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,2,13,4,5,14,7] +; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,21,10,11,22,13,14,23] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,0,1,17,5,2,18,8,9,24,11,8,25,10,11] +; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,3,4,9,6,7] +; AVX512F-FAST-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [16,9,10,17,12,13,18,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,2,2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm12, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,2,18,2,u,19,3,3,12,28,12,13,29,13,14,30> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,9,2,3,10,5,6,11] ; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1923,7 +1934,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,0] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -1936,8 +1947,8 @@ ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm5[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] @@ -1951,7 +1962,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: movdqa %xmm8, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] ; SSE-NEXT: movdqa %xmm3, %xmm0 @@ -1964,7 +1975,7 @@ ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] ; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 @@ -1976,7 +1987,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] @@ -1986,8 +1997,8 @@ ; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm0 @@ -2000,7 +2011,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm9 @@ -2010,7 +2021,7 @@ ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: pandn %xmm11, %xmm8 @@ -2020,7 +2031,7 @@ ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] ; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm10[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm10[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm11 @@ -2030,8 +2041,8 @@ ; SSE-NEXT: andps %xmm1, %xmm9 ; SSE-NEXT: por %xmm9, %xmm6 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm10[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm9 @@ -2043,7 +2054,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] ; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm2[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] ; SSE-NEXT: pshufd 
{{.*#+}} xmm13 = xmm0[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 @@ -2053,7 +2064,7 @@ ; SSE-NEXT: por %xmm11, %xmm15 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm13, %xmm10 @@ -2064,7 +2075,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm13[0,2] ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa %xmm4, %xmm14 @@ -2074,8 +2085,8 @@ ; SSE-NEXT: andps %xmm1, %xmm11 ; SSE-NEXT: por %xmm11, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm11 @@ -2087,7 +2098,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm14[0,2] ; SSE-NEXT: andps %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] @@ -2096,7 +2107,7 @@ ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] ; SSE-NEXT: andps %xmm5, %xmm12 ; SSE-NEXT: pandn %xmm14, %xmm5 @@ -2145,264 +2156,268 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $120, %rsp -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: subq $88, %rsp +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1],xmm14[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0],xmm13[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw 
{{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm13[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] 
-; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm11[0],xmm3[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 
+; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2],xmm15[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0],xmm13[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} 
xmm1 = xmm1[0,1],xmm4[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3],xmm10[4,5,6,7] -; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm11[0,1],xmm3[0],xmm11[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm0 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0],xmm7[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm12[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 368(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 320(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 336(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 288(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 304(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 256(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 272(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 224(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 240(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2410,1767 +2425,1756 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) -; AVX1-ONLY-NEXT: addq $120, %rsp +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) +; AVX1-ONLY-NEXT: addq $88, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm9[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero 
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; 
AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; 
AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, 
%ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,0,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm15[4],ymm11[5],ymm15[5],ymm11[6],ymm15[6],ymm11[7],ymm15[7],ymm11[12],ymm15[12],ymm11[13],ymm15[13],ymm11[14],ymm15[14],ymm11[15],ymm15[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm5[4],mem[4],ymm5[5],mem[5],ymm5[6],mem[6],ymm5[7],mem[7],ymm5[12],mem[12],ymm5[13],mem[13],ymm5[14],mem[14],ymm5[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 352(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: subq $632, %rsp # imm = 0x278 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm15 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 
; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm12 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm12 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; 
AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; 
AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm2 -; AVX2-FAST-NEXT: 
vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm4[4],ymm10[5],ymm4[5],ymm10[6],ymm4[6],ymm10[7],ymm4[7],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] ; AVX2-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm15 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw $96, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = mem[0,0,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [5,4,2,2,5,4,6,6] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm0 -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 288(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf32: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[8],ymm7[8],ymm13[9],ymm7[9],ymm13[10],ymm7[10],ymm13[11],ymm7[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[12],ymm7[12],ymm4[13],ymm7[13],ymm4[14],ymm7[14],ymm4[15],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[12],ymm13[12],ymm5[13],ymm13[13],ymm5[14],ymm13[14],ymm5[15],ymm13[15] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: addq $632, %rsp # imm = 0x278 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: subq $568, %rsp # imm = 0x238 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = 
mem[2,1,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[8],ymm2[8],ymm14[9],ymm2[9],ymm14[10],ymm2[10],ymm14[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[12],ymm10[12],ymm12[13],ymm10[13],ymm12[14],ymm10[14],ymm12[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq 
{{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 
= [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[8],ymm14[8],ymm2[9],ymm14[9],ymm2[10],ymm14[10],ymm2[11],ymm14[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 
= ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[4],mem[4],ymm4[5],mem[5],ymm4[6],mem[6],ymm4[7],mem[7],ymm4[12],mem[12],ymm4[13],mem[13],ymm4[14],mem[14],ymm4[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm12, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm10[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = 
mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $568, %rsp # imm = 0x238 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm5 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; 
AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm7[1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm9[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm3[1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm15 = 
xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm13 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,1,2,1] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm13[0,1,2,3],zmm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm2[1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3],ymm7[4],ymm2[5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: 
vpermq {{.*#+}} ymm8 = ymm3[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm9[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm10, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm10, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm16, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: 
vpternlogq $184, %zmm4, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm11, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: pushq %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,1,2,3,11,11,11,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm4[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <2,2,u,3,10,u,10,11> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm9[4],ymm2[5],ymm9[5],ymm2[6],ymm9[6],ymm2[7],ymm9[7],ymm2[12],ymm9[12],ymm2[13],ymm9[13],ymm2[14],ymm9[14],ymm2[15],ymm9[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; 
AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm10[0],ymm4[1],ymm10[1],ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[8],ymm10[8],ymm4[9],ymm10[9],ymm4[10],ymm10[10],ymm4[11],ymm10[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm11[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: subq $40, %rsp +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,2,1,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm23, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm21, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 
(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,1,2,3,11,11,11,11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm0[4],ymm15[5],ymm0[5],ymm15[6],ymm0[6],ymm15[7],ymm0[7],ymm15[12],ymm0[12],ymm15[13],ymm0[13],ymm15[14],ymm0[14],ymm15[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm26, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm27, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <2,2,u,3,10,u,10,11> +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm8, %ymm22, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm1[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm12[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm23, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm22, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm2[0,1,2,3],zmm5[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[8],ymm9[8],ymm2[9],ymm9[9],ymm2[10],ymm9[10],ymm2[11],ymm9[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm14[4],ymm4[5],ymm14[5],ymm4[6],ymm14[6],ymm4[7],ymm14[7],ymm4[12],ymm14[12],ymm4[13],ymm14[13],ymm4[14],ymm14[14],ymm4[15],ymm14[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[2],ymm0[2],ymm12[3],ymm0[3],ymm12[8],ymm0[8],ymm12[9],ymm0[9],ymm12[10],ymm0[10],ymm12[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [1,1,1,1,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm21, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = 
[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,1,u,10,10,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm20, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm3, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm21, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm21, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm18, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,u,0,1,u,10,10,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[8],ymm14[8],ymm4[9],ymm14[9],ymm4[10],ymm14[10],ymm4[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm24, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm4, %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm21, 
%zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm10[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm17, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: popq %rax +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm23, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $40, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 
32(%rcx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm8, %xmm27 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] ; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm4 -; 
AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm26 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm28 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm30 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm8, %xmm31 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm22 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} 
-; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), 
%ymm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[8],ymm0[8],ymm9[9],ymm0[9],ymm9[10],ymm0[10],ymm9[11],ymm0[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm20 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm8[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; 
AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm12, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm0[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm16[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm13, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm3, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm30[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm10, %zmm12 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm10, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: 
vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: subq $72, %rsp +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm21, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm22, %zmm19 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm1, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm6, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,u,0,1,u,10,10,u> -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm5, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm23 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm17, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm21, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,u,0,1,u,10,10,u> +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm20, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm21, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm6, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm31 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm22, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm12, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm9 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm9, %ymm18, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm21, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm9 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,2,3,11,11,11,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = 
ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,2,u,3,10,u,10,11> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm14, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm4[0,1,2,3] -; 
AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm20, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm6, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm4, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm5, %ymm10 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm10[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm4, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm22, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm20, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm0 +; 
AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[2],ymm8[2],ymm14[3],ymm8[3],ymm14[8],ymm8[8],ymm14[9],ymm8[9],ymm14[10],ymm8[10],ymm14[11],ymm8[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm8, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm1, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm21, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm25, %ymm10 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm10, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX512DQ-FAST-NEXT: vpermi2d %ymm2, %ymm15, %ymm20 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <2,2,u,3,10,u,10,11> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm24, %ymm12 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm7, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm20, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm8, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm18, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm17[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm29, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: addq $72, %rsp ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -4616,7 +4620,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,0] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -4629,8 +4633,8 @@ ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] @@ -4644,7 +4648,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[0,2] ; SSE-NEXT: movdqa %xmm2, %xmm1 @@ -4658,7 +4662,7 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm0 @@ -4671,7 +4675,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -4683,8 +4687,8 @@ ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm5 @@ -4698,7 +4702,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm1 @@ -4712,7 +4716,7 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 @@ -4725,7 +4729,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4737,8 +4741,8 @@ ; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm4 @@ -4751,7 +4755,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm3 @@ -4763,7 +4767,7 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 @@ -4775,7 +4779,7 @@ ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4787,8 +4791,8 @@ ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm3 @@ -4801,7 +4805,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 @@ -4812,7 +4816,7 @@ ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 @@ -4824,7 +4828,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4836,8 +4840,8 @@ ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm3 @@ -4850,7 +4854,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 @@ -4861,7 +4865,7 @@ ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 @@ -4873,7 +4877,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4885,8 +4889,8 @@ ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm3 @@ -4899,7 +4903,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 @@ -4910,7 +4914,7 @@ ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 @@ -4922,7 +4926,7 @@ ; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4933,8 +4937,8 @@ ; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm4 @@ -4947,7 +4951,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -4959,7 +4963,7 @@ ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 @@ -4971,7 +4975,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[1,0] ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] ; SSE-NEXT: movdqa %xmm15, %xmm5 @@ -4982,9 +4986,9 @@ ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: movdqa %xmm15, %xmm8 @@ -4997,7 +5001,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2] ; SSE-NEXT: andps %xmm12, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] @@ -5006,7 +5010,7 @@ ; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,2] ; SSE-NEXT: andps %xmm10, %xmm7 ; SSE-NEXT: pandn 
%xmm5, %xmm10 @@ -5104,558 +5108,554 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = xmm2[0,1],xmm6[0],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} 
xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm14[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm9[1,2],xmm11[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; 
AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm6[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm1 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm5[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm0[0],xmm7[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: 
vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm2[0],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm9 -; 
AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq 
{{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm1[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill 
+; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm0[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5,6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm11[0,1],xmm3[0],xmm11[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm15[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm1[0],xmm7[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm15[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm15 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1,2],xmm15[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm0[0],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm14[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1,2],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm6[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 
32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 752(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 736(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 720(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 704(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 688(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 672(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 656(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 640(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 624(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 608(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 592(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 624(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 608(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 688(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 672(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 656(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 752(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 720(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5665,38 +5665,38 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%rax) -; AVX1-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) +; AVX1-ONLY-NEXT: addq $472, %rsp # imm = 0x1D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6185,7 +6185,7 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] @@ -6413,7 +6413,7 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] @@ -6434,8 +6434,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 @@ -6745,12 +6744,12 @@ ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] @@ -7139,7 +7138,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 @@ -7155,8 +7154,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] @@ -7314,12 +7312,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] @@ -7420,2019 +7418,1983 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $600, %rsp # imm = 0x258 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: subq $200, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[12],ymm9[12],ymm7[13],ymm9[13],ymm7[14],ymm9[14],ymm7[15],ymm9[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = 
ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm4[4],ymm14[5],ymm4[5],ymm14[6],ymm4[6],ymm14[7],ymm4[7],ymm14[12],ymm4[12],ymm14[13],ymm4[13],ymm14[14],ymm4[14],ymm14[15],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm3[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm25, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm20, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm15, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm14[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, 
%ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[0,1,2,3],zmm1[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm29, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
<17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[8],ymm9[8],ymm7[9],ymm9[9],ymm7[10],ymm9[10],ymm7[11],ymm9[11] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} 
xmm1 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm9[0,1,2,3],zmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm31, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm14[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm14[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: 
vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
(%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm10[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm23, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: 
vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm11 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm19, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm5[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm23, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: 
vpshuflw {{.*#+}} xmm15 = xmm10[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm15, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm15[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm10[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; 
AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm26 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm26, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, (%rsp), %zmm19, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm28, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm10, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, 
%zmm21, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm25, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm19, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $600, %rsp # imm = 0x258 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 704(%rax) 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $200, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: subq $1368, %rsp # imm = 0x558 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm23 -; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[8],ymm15[8],ymm0[9],ymm15[9],ymm0[10],ymm15[10],ymm0[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,0,2,1,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm25, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[8],ymm0[8],ymm8[9],ymm0[9],ymm8[10],ymm0[10],ymm8[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,1,2,3,11,11,11,11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm12[4],ymm1[5],ymm12[5],ymm1[6],ymm12[6],ymm1[7],ymm12[7],ymm1[12],ymm12[12],ymm1[13],ymm12[13],ymm1[14],ymm12[14],ymm1[15],ymm12[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm26, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,2,3,11,11,11,11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <2,2,u,3,10,u,10,11> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 
{{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm30, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,2,1,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <2,2,u,3,10,u,10,11> +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm16, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 
64(%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm16, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm2[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm3[0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: 
vpermt2d %ymm2, %ymm30, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, 
%zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm1, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm25[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm28, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, 
%zmm18, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm31, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm15, %ymm16, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm7[0,1,2,3],zmm1[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm30, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2d %zmm2, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm31, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm1, %zmm12, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm16, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,2,3],zmm30[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} 
ymm18 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,u,0,1,u,10,10,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm25, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm24, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm15[4],ymm3[5],ymm15[5],ymm3[6],ymm15[6],ymm3[7],ymm15[7],ymm3[12],ymm15[12],ymm3[13],ymm15[13],ymm3[14],ymm15[14],ymm3[15],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = 
ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [1,1,1,1,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm14, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm28, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,u,0,1,u,10,10,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm28, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm0 # 
64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm6, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm28, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm7, %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%ymm21, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm30, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm6 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm25, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm12, %xmm26 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm7, %xmm31 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm8, %xmm30 ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm9, %xmm28 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-SLOW-NEXT: 
vpmovzxwd {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm11, %xmm29 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 ; 
AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm1[0,1,2,3],zmm0[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm11[4],ymm4[4],ymm11[5],ymm4[5],ymm11[6],ymm4[6],ymm11[7],ymm4[7],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm10 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; 
AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm2[4],ymm14[5],ymm2[5],ymm14[6],ymm2[6],ymm14[7],ymm2[7],ymm14[12],ymm2[12],ymm14[13],ymm2[13],ymm14[14],ymm2[14],ymm14[15],ymm2[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = 
ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm6, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm0[4],ymm14[5],ymm0[5],ymm14[6],ymm0[6],ymm14[7],ymm0[7],ymm14[12],ymm0[12],ymm14[13],ymm0[13],ymm14[14],ymm0[14],ymm14[15],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm3[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd 
{{.*#+}} ymm15 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm24, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm15 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm15 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm14 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = 
xmm13[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm13[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm15 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm16 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[8],ymm7[8],ymm13[9],ymm7[9],ymm13[10],ymm7[10],ymm13[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm15 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm5[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: 
vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,2,2,3,4,5,6,7] +; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm7, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm29 ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: 
vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm14, %zmm6, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte 
Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm15, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,3,2,3,6,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = 
zmm12[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm16 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm15 = mem[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm13 = mem[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = 
ymm14[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm18, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm16 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, 
%zmm29, %zmm19 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm19 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm31, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm18 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm10 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-SLOW-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQ-SLOW-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: subq $1304, %rsp # imm = 0x518 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd 
{{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm30 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 
32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm23, %zmm20 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm3, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm5, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,u,0,1,u,10,10,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm18, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,u,0,1,u,10,10,u> ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, 
%xmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm10 +; AVX512DQ-FAST-NEXT: vpermd %zmm21, %zmm23, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm10, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm20, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm4, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm19, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm6, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm23, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm21 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm7, %ymm19, %ymm17 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm22, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512DQ-FAST-NEXT: 
vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm20, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm23, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm17, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm9[4],ymm5[5],ymm9[5],ymm5[6],ymm9[6],ymm5[7],ymm9[7],ymm5[12],ymm9[12],ymm5[13],ymm9[13],ymm5[14],ymm9[14],ymm5[15],ymm9[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[2,1,2,3,11,11,11,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm30, %ymm8 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <2,2,u,3,10,u,10,11> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, %xmm7 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] 
+; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm27, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <2,2,u,3,10,u,10,11> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm23, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, 
%ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm27, %ymm2 ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm28, %ymm2 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm30, %ymm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,2,1,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: 
vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm27, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm4 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm24, 
%ymm3 ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm3, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm26, %zmm4 ; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm16, %ymm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm0 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm27, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm5 -; 
AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm30, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm31, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[12],ymm10[12],ymm7[13],ymm10[13],ymm7[14],ymm10[14],ymm7[15],ymm10[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm13[4],ymm10[5],ymm13[5],ymm10[6],ymm13[6],ymm10[7],ymm13[7],ymm10[12],ymm13[12],ymm10[13],ymm13[13],ymm10[14],ymm13[14],ymm10[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm6[4],ymm11[5],ymm6[5],ymm11[6],ymm6[6],ymm11[7],ymm6[7],ymm11[12],ymm6[12],ymm11[13],ymm6[13],ymm11[14],ymm6[14],ymm11[15],ymm6[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm27, %ymm8 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm10, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm23 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm30, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 
-; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm31, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm28, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm19, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm13, %ymm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm13, %zmm11, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm13, %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm13 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm14, %ymm0, %ymm0 -; 
AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm16, %ymm10 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm10[0,1,2,3],zmm11[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm9 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm18, %zmm10 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm28[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm11 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm7, %ymm27, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm24, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-FAST-NEXT: 
vpbroadcastq %xmm11, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm11, %zmm7, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512DQ-FAST-NEXT: vpermi2d %ymm7, %ymm6, %ymm23 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm31, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm11[0],ymm9[1],ymm11[1],ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[8],ymm11[8],ymm9[9],ymm11[9],ymm9[10],ymm11[10],ymm9[11],ymm11[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm28, %ymm11 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm11, %zmm12, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm11, 
%ymm9, %ymm27 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm29, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm11, %zmm12 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm29, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm25[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm17[0,1,2,3],mem[0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm22, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax) +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm21, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm12, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm9, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; 
AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 704(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm26[0,1,2,3],mem[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm16, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm22[0,1,2,3],mem[0,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: addq $1304, %rsp # imm = 0x518 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm11 
= mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm5, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm3, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm13, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm6, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm9, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm10, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm21, %zmm23 +; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm12, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm26, %zmm25 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm16, %zmm3, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm5, %zmm10 +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm5, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm5, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm5, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm19, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm15 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2w %zmm15, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm15 ; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm2 
{%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm14, %zmm2 ; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm22, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm6 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm6 +; AVX512BW-NEXT: 
vmovdqa32 %zmm18, %zmm9 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm22, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm23, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm7, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm15, %zmm17 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm8, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 
%zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -20,83 +20,85 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,7,7] -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, 16(%rax) -; SSE-NEXT: movdqa %xmm4, (%rax) -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm3 +; SSE-NEXT: movdqa (%r8), %xmm4 +; SSE-NEXT: movdqa (%r9), %xmm0 +; SSE-NEXT: movdqa (%r10), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 
= xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[2,1] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: andnps %xmm4, %xmm5 +; SSE-NEXT: orps %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] ; SSE-NEXT: movd %xmm0, 24(%rax) +; SSE-NEXT: movq %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm5, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf2: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,2,3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u] +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,2,3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,10,11,14,15,u,u,u,u,u,u,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,3,6,7,10,11,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpextrd $2, %xmm1, 24(%rax) -; AVX1-ONLY-NEXT: vmovq %xmm0, 16(%rax) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,10,11,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,6,7,10,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpextrd $2, %xmm0, 24(%rax) +; AVX1-ONLY-NEXT: vmovq %xmm1, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride7_vf2: ; AVX2-ONLY: # %bb.0: ; 
AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-ONLY-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21] ; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-ONLY-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -106,22 +108,24 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512F-NEXT: vmovq %xmm1, 16(%rax) ; AVX512F-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -131,20 +135,24 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512BW-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u> -; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512BW-NEXT: vpextrd $2, %xmm0, 24(%rax) -; AVX512BW-NEXT: vmovq %xmm0, 16(%rax) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rax) +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u> +; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpextrd $2, %xmm1, 24(%rax) +; AVX512BW-NEXT: vmovq %xmm1, 16(%rax) +; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64 @@ -171,129 +179,139 @@ ; SSE-LABEL: store_i16_stride7_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] -; 
SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm7, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: por %xmm12, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm10 ; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm4[0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: 
pand %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: por %xmm9, %xmm8 +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: pandn %xmm10, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,1] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: andnps %xmm2, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm3, (%rax) -; SSE-NEXT: movq %xmm7, 48(%rax) -; SSE-NEXT: movdqa %xmm6, 32(%rax) -; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: orps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movq %xmm9, 48(%rax) +; SSE-NEXT: movdqa %xmm7, 32(%rax) +; SSE-NEXT: movdqa %xmm8, 16(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,1,2,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm0[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,u,u,u,u,u,u,u,u,u,u,4,5,12,13] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} 
xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rax) -; AVX1-ONLY-NEXT: vmovq %xmm4, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[14,15,4,5,8,9,u,u,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovlps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf4: @@ -304,39 +322,42 @@ ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-SLOW-NEXT: movq (%r10), %rcx +; AVX2-SLOW-NEXT: vmovq %rcx, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} 
ymm2 = ymm2[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm5[26,27],zero,zero,zero,zero,ymm5[24,25,20,21,22,23,20,21,28,29] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovq %xmm1, 48(%rax) -; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,20,21] +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovd %ecx, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: 
vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-SLOW-NEXT: vmovq %xmm0, 48(%rax) +; AVX2-SLOW-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -348,37 +369,42 @@ ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-NEXT: movq (%r10), %rcx +; AVX2-FAST-NEXT: vmovq %rcx, %xmm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,1,5,0,1,1,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29] -; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,20,21] +; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovd %ecx, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FAST-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-FAST-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -390,38 +416,41 @@ ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: movq (%r10), %rcx +; AVX2-FAST-PERLANE-NEXT: vmovq %rcx, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,4,5,14,15,u,u,u,u,16,17,18,19,20,21,22,23,20,21,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm5[26,27],zero,zero,zero,zero,ymm5[24,25,20,21,22,23,20,21,28,29] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, 48(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,20,21] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm0, %ymm0 +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovd %ecx, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 48(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -437,32 +466,33 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[6,7],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,4,5,12,13],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[22,23,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, 
%zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm1[26,27],zero,zero,zero,zero,ymm1[u,u,u,u,u,u,20,21,28,29] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovq %xmm1, 48(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -478,29 +508,30 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,22,23,26,27],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpermq 
{{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] -; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,8,9],zero,zero,zero,zero,ymm0[u,u,u,u,u,u,2,3],zero,zero,ymm0[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <4,6,u,u,u,2,1,3> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,u,u,u,u,u,u],zero,zero,ymm3[22,23],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,24,25,28,29] +; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,7,1,3,7,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm0[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512F-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovq %xmm1, 48(%rax) @@ -520,16 +551,16 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u> -; AVX512BW-SLOW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax) +; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512BW-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -545,12 +576,12 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 
-; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u> ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) @@ -583,146 +614,148 @@ ; SSE-LABEL: store_i16_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm5 -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa (%rax), %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm3 +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: movdqa (%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm11 +; SSE-NEXT: por %xmm13, 
%xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 ; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm9, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm14 -; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm15[0,2] -; SSE-NEXT: andps %xmm9, %xmm14 -; SSE-NEXT: andnps %xmm13, %xmm9 -; SSE-NEXT: orps %xmm14, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm12[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm13[0,2] +; SSE-NEXT: andps %xmm8, %xmm12 +; SSE-NEXT: andnps %xmm10, %xmm8 +; SSE-NEXT: orps %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm12[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} 
xmm15 = xmm15[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] -; SSE-NEXT: pandn %xmm11, %xmm10 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm12, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm2[1,1] -; SSE-NEXT: pandn %xmm13, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm14[0,3] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] +; SSE-NEXT: psrld $16, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm2, 
%xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[1,1] +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,1] -; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: andps %xmm4, %xmm6 -; SSE-NEXT: andnps %xmm0, %xmm4 -; SSE-NEXT: orps %xmm6, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm2, %xmm5 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm5, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm15, %xmm11 +; SSE-NEXT: pandn %xmm12, %xmm15 +; SSE-NEXT: por %xmm11, %xmm15 +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,1] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: andps %xmm2, %xmm6 +; SSE-NEXT: andnps %xmm15, %xmm2 +; SSE-NEXT: orps %xmm6, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm0, %xmm13 +; SSE-NEXT: andnps %xmm1, %xmm0 +; SSE-NEXT: orps %xmm13, %xmm0 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, (%rax) -; SSE-NEXT: movaps %xmm4, 64(%rax) -; SSE-NEXT: movdqa %xmm15, 16(%rax) -; SSE-NEXT: movdqa %xmm8, 32(%rax) -; SSE-NEXT: movaps %xmm9, 48(%rax) -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; 
SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movdqa %xmm10, 96(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm2, 64(%rax) +; SSE-NEXT: movdqa %xmm4, 96(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps %xmm10, 80(%rax) +; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf8: @@ -831,10 +864,10 @@ ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] @@ -842,36 +875,36 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,2,0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[28,29,20,21] ; AVX2-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[20,21,28,29],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -883,8 +916,8 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -902,42 +935,42 @@ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,24,25],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,5,u,u,5,2,6,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,2,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[28,29,20,21] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17,24,25],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,5,u,u,5,2,6,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,24,25] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-FAST-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r10), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm9[1,3,3,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-FAST-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FAST-NEXT: 
vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -949,9 +982,9 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -983,29 +1016,29 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,2,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[28,29,20,21] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, 
%ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm9[1,3,3,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -1017,8 +1050,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1033,12 +1066,12 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] +; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] @@ -1047,11 +1080,11 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,2,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,2,0] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,8,9],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,28,29,20,21] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 ; AVX512F-SLOW-NEXT: vporq %zmm9, %zmm10, %zmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,0,2] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 @@ -1072,10 +1105,10 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,3,1] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 @@ -1097,9 +1130,9 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm8 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1110,30 +1143,30 @@ ; AVX512F-FAST-NEXT: vpbroadcastd 12(%r10), %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm8[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,3,1] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 ; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,2,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,2,2,0] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,5,u,u,5,2,6,u> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-FAST-NEXT: vporq %zmm2, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 @@ -1155,17 +1188,17 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, 
%ymm1 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1195,187 +1228,194 @@ ; SSE-NEXT: subq $216, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm15 -; SSE-NEXT: movdqa 16(%rcx), %xmm1 -; SSE-NEXT: movdqa 16(%r8), %xmm8 -; SSE-NEXT: movdqa 16(%r9), %xmm7 -; SSE-NEXT: movdqa 16(%rax), %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: movdqa (%r8), %xmm14 +; SSE-NEXT: movdqa 16(%r8), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: movdqa 16(%rax), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] -; SSE-NEXT: andnps %xmm1, %xmm6 -; SSE-NEXT: orps %xmm0, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; 
SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm13 -; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm7 = 
[65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm7[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0,2] +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,2],xmm11[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm0[0,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = 
xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2] -; SSE-NEXT: andps %xmm8, %xmm1 -; SSE-NEXT: orps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,1,0,1] +; SSE-NEXT: pandn %xmm13, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; 
SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -1384,157 +1424,161 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm10[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,0] +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: 
movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm3[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: andnps %xmm13, %xmm14 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: orps %xmm6, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm6, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm13, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: movaps (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm5[1,1] +; SSE-NEXT: andnps %xmm13, %xmm8 +; SSE-NEXT: orps %xmm6, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm7[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm6[2,0] +; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm1, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm13, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = 
zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm7[1,1] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm5, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 -; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm11[1,1] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm12, %xmm9 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,1] -; SSE-NEXT: andps %xmm1, 
%xmm11 -; SSE-NEXT: orps %xmm9, %xmm11 -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[2,1] +; SSE-NEXT: andps %xmm11, %xmm10 +; SSE-NEXT: orps %xmm0, %xmm10 +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,1] -; SSE-NEXT: andps %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,1] +; SSE-NEXT: andps %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: andps %xmm4, %xmm13 -; SSE-NEXT: por %xmm13, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: andps %xmm4, %xmm15 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: andps %xmm4, %xmm12 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; 
SSE-NEXT: por %xmm12, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm4, 112(%rax) -; SSE-NEXT: movdqa %xmm5, (%rax) -; SSE-NEXT: movdqa %xmm1, 176(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) -; SSE-NEXT: movaps %xmm6, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm11, 176(%rax) +; SSE-NEXT: movaps %xmm10, 64(%rax) +; SSE-NEXT: movaps %xmm1, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) +; SSE-NEXT: movaps %xmm8, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm3, 144(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm14, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movdqa %xmm8, 96(%rax) +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: addq $216, %rsp @@ -1542,464 +1586,449 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $40, %rsp +; AVX1-ONLY-NEXT: subq $152, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX1-ONLY-NEXT: vpunpckhwd 
{{.*#+}} xmm12 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1,2,3,4,5,6],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm13[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm10[6],xmm3[7] -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,2],xmm13[1,3] -; AVX1-ONLY-NEXT: 
vmovaps %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, 
%ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = 
zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm12[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: 
vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $40, %rsp +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $40, %rsp +; AVX2-SLOW-NEXT: subq $72, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-SLOW-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, 
%ymm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3,4],xmm7[5],xmm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = 
<255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm4[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: addq $40, %rsp +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $72, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: pushq %rax +; AVX2-FAST-NEXT: subq $40, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] @@ -2012,8 +2041,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] @@ -2021,82 +2049,105 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, 
%ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm12, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; 
AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2106,212 +2157,217 @@ ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) -; AVX2-FAST-NEXT: popq %rax +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 
%ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, 
%ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, 
%ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2457,128 +2513,131 @@ ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,u,3,2,u,10,10,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, 
%zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm10, %ymm20, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[16,17,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%r10), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[16,17,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm9, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm9 +; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm8[0,0,1,1,4,4,5,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%r10), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%r10), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,u,3,2,u,10,10,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm10 +; 
AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm11, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm7, %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,u,u,7,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -2724,128 +2783,131 @@ ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,u,3,2,u,10,10,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm15 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm14 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandnq %ymm10, %ymm20, %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; 
AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[16,17,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%r10), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[16,17,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm9, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm16 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm8[0,0,1,1,4,4,5,5] +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%r10), %ymm8 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%r10), %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,u,3,2,u,10,10,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd %zmm7, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,u,u,7,u,u,7> +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -2860,38 +2922,38 @@ ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47> +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u> +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u> ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512BW-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm5, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 @@ -2917,582 +2979,590 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $680, %rsp # imm = 0x2A8 +; SSE-NEXT: subq $712, %rsp # imm = 0x2C8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm1 -; SSE-NEXT: movdqa 48(%rcx), %xmm5 -; SSE-NEXT: movdqa 48(%r8), %xmm9 -; SSE-NEXT: movdqa 48(%r9), %xmm4 +; SSE-NEXT: movdqa 48(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r8), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa 48(%r9), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm5 +; SSE-NEXT: movdqa 48(%rax), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: 
andps %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa (%rcx), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm15 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, 
%xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm2, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] +; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} 
xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm6[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa 16(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; 
SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0,2] +; SSE-NEXT: andps %xmm14, %xmm3 ; SSE-NEXT: orps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa 32(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm10 -; SSE-NEXT: movdqa 32(%r9), %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r8), %xmm15 +; SSE-NEXT: movdqa 32(%r9), %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa 32(%rcx), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa 32(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa 32(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm1, %xmm6 ; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm3, %xmm7 +; SSE-NEXT: orps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm5, %xmm4 
-; SSE-NEXT: andnps %xmm2, %xmm5 -; SSE-NEXT: orps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm10 ; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte 
Reload -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm2 -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm12[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm4, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,2,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE-NEXT: psrlq $48, %xmm15 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: psrlq $48, %xmm14 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm15 +; 
SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm12 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por 
%xmm0, %xmm15 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: andnps %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm10[1,1] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: andnps %xmm2, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm13[1,1] +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm12, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] +; SSE-NEXT: movaps %xmm8, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: andnps %xmm1, %xmm10 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: orps %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: andnps %xmm1, %xmm12 -; SSE-NEXT: orps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] +; SSE-NEXT: andnps %xmm1, %xmm6 +; SSE-NEXT: orps %xmm0, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,6,7] ; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] @@ -3500,99 +3570,103 @@ ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm11 -; SSE-NEXT: orps %xmm1, 
%xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movapd %xmm7, %xmm13 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1],mem[0] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm13 -; SSE-NEXT: orps %xmm1, %xmm13 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: shufps $2, (%rsp), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm4[2,1] +; SSE-NEXT: andps %xmm1, %xmm13 +; SSE-NEXT: orps %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1] -; SSE-NEXT: andps %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0],mem[0] -; SSE-NEXT: shufps $98, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2,0],mem[2,1] +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,1] +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: andps %xmm6, %xmm8 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2,0],mem[2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: andps %xmm9, %xmm12 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0] ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} 
xmm11 = xmm5[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm4, %xmm11 +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0] @@ -3600,22 +3674,24 @@ ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: andps %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, 336(%rax) -; SSE-NEXT: movdqa %xmm9, 224(%rax) -; SSE-NEXT: movdqa %xmm8, 112(%rax) -; SSE-NEXT: movdqa %xmm1, (%rax) -; SSE-NEXT: movdqa %xmm2, 288(%rax) +; SSE-NEXT: movdqa %xmm9, 336(%rax) +; SSE-NEXT: movdqa %xmm11, 224(%rax) +; SSE-NEXT: movdqa %xmm12, 112(%rax) +; SSE-NEXT: movdqa %xmm3, (%rax) +; SSE-NEXT: movdqa %xmm1, 288(%rax) ; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm2, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm12, 368(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rax) +; SSE-NEXT: movaps %xmm6, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3644,8 +3720,8 @@ ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm15, 320(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, 320(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) @@ -3653,561 +3729,567 @@ ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: addq $680, %rsp # imm = 0x2A8 +; SSE-NEXT: addq $712, %rsp # imm = 0x2C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $584, %rsp # imm = 0x248 +; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm13 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm8 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm11[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm9[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; 
AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[0,2],xmm3[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm6 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm5[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = 
[65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm15[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm15[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; 
AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm7[1],xmm2[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: 
vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm13[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; 
AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm15[3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 432(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $584, %rsp # imm = 0x248 +; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: 
vmovdqa 32(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} ymm2 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] @@ -4216,9 +4298,9 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] @@ -4230,18 +4312,18 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4252,7 +4334,7 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4261,109 +4343,113 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9 +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm14 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm9 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm9, %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm15 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] @@ -4382,119 +4468,120 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: 
vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: 
vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = 
<255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; 
AVX2-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] @@ -4502,21 +4589,21 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 32(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4527,428 +4614,417 @@ ; ; AVX2-FAST-LABEL: store_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $312, %rsp # imm = 0x138 +; AVX2-FAST-NEXT: subq $280, %rsp # imm = 0x118 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm6 +; 
AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, 
%ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7,8,9,10],ymm14[11],ymm1[12,13],ymm14[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm11, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 -; 
AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,2,3,6,7,5,6] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb 
%ymm5, %ymm7, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3,4],xmm4[5],xmm11[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2,3],xmm5[4],xmm11[5,6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm0 +; 
AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm5[2],xmm11[3,4],xmm5[5],xmm11[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rax) -; 
AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-FAST-NEXT: addq $280, %rsp # imm = 0x118 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} 
ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] @@ -4957,39 +5033,40 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu 
%ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4998,105 +5075,110 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 
= xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm12, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -5105,7 +5187,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] @@ -5115,1378 +5197,1397 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = 
xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} 
ymm4 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = 
mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-ONLY-SLOW-NEXT: subq $856, %rsp # imm = 0x358 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm1 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm6, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm9, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = 
xmm1[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm1, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm15, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4],ymm1[5],ymm8[6,7,8,9],ymm1[10],ymm8[11,12],ymm1[13],ymm8[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm3, %ymm8, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6,7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, 
%ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm9, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-ONLY-SLOW-NEXT: 
vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm1, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm1, %zmm2, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6,7,8],ymm0[9],ymm4[10,11],ymm0[12],ymm4[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm5, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,1,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm31, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm27[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = 
ymm24[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm19[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm26[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm25[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm17[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm27[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm24[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm14[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm0 = mem[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm14, %zmm27, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm14[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm23[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm16[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm29[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm31, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd (%rax), %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-ONLY-SLOW-NEXT: addq $856, %rsp # imm = 0x358 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $232, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: subq $280, %rsp # imm = 0x118 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
[128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm6, %ymm7, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm4 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,8,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5] ; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,0,1,1,12,13,u,15> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <2,2,u,3,u,10,10,11> +; 
AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,3,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm11, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8,9,10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: 
vpshufb %ymm5, %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,2,3,8,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm22, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,8,8,10,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2,3],xmm6[4],xmm12[5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm26, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7,8,9],ymm5[10],ymm10[11,12],ymm5[13],ymm10[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm16, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm17, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm6, %ymm16, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm11, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm26[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm19, %zmm26, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,1,1,8,8,10,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3],xmm3[4],xmm14[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm17[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 +; 
AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm29, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm27, %zmm26, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $232, %rsp +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $280, %rsp # imm = 0x118 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512DQ-SLOW-NEXT: subq $856, %rsp # imm = 0x358 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm6, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm7 -; AVX512DQ-SLOW-NEXT: vpor 
%ymm5, %ymm7, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm9, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = 
[6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm1 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm8, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm1[2],ymm8[3,4],ymm1[5],ymm8[6,7,8,9],ymm1[10],ymm8[11,12],ymm1[13],ymm8[14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm6, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd 
{{.*#+}} xmm13 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpandn %ymm3, %ymm8, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6,7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm31 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 +; AVX512DQ-SLOW-NEXT: vpshufd 
{{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm1, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm2, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6,7,8],ymm0[9],ymm4[10,11],ymm0[12],ymm4[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; 
AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,1,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm31, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm27[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm24[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm17 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm21[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; 
AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm19[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm8[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm26[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm25[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm17[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm27[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm24[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm14[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm27 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermd %zmm14, %zmm27, %zmm27 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm14[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm23[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm16[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm2 ; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm27 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm29[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm7 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; 
AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm5 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd (%rax), %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512DQ-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512DQ-SLOW-NEXT: addq $856, %rsp # imm = 0x358 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $232, %rsp -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: subq $280, %rsp # imm = 0x118 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm7 -; AVX512DQ-FAST-NEXT: vporq %ymm6, %ymm7, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 ; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} 
ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: 
vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,1,1,1,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; 
AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm8, %xmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,0,1,1,12,13,u,15> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <2,2,u,3,u,10,10,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm28 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm11, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd 
{{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7,8,9,10],ymm5[11],ymm1[12,13],ymm5[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm13 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm12 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm31, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm22, %zmm17, %zmm13 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] 
-; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2,3],xmm6[4],xmm12[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm26, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7,8,9],ymm5[10],ymm10[11,12],ymm5[13],ymm10[14,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = 
ymm9[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm12 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm16, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm23, %zmm17, %zmm19 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm30 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm30 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm0[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8,9,10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandnq %ymm6, %ymm16, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm11, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm26[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm26, %zmm22 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, 
%xmm1, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3],xmm3[4],xmm14[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm17[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,3,3] +; 
AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm29, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm27, %zmm26, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm14 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm14 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm31, %zmm2 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 
320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $232, %rsp +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $280, %rsp # imm = 0x118 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -6648,522 +6749,519 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1640, %rsp # imm = 0x668 +; SSE-NEXT: subq $1656, %rsp # imm = 0x678 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 112(%rsi), %xmm11 +; SSE-NEXT: movdqa 96(%rdx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdx), %xmm10 +; SSE-NEXT: movdqa 96(%rcx), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm5 +; SSE-NEXT: movdqa 112(%rcx), %xmm1 +; SSE-NEXT: movdqa 112(%r8), %xmm7 +; SSE-NEXT: movdqa 112(%r9), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm1 -; SSE-NEXT: movdqa 96(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rcx), %xmm6 -; SSE-NEXT: movdqa 112(%r8), %xmm4 -; SSE-NEXT: movdqa 112(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rax), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa 112(%rax), %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] -; SSE-NEXT: movaps 
{{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm4 -; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm9, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: andnps %xmm2, %xmm3 +; SSE-NEXT: orps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: movdqa 96(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 96(%r8), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa 96(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movdqa 96(%rax), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 96(%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: orps %xmm3, %xmm4 +; SSE-NEXT: movdqa 96(%r9), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movdqa 96(%rax), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: andps %xmm5, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 +; SSE-NEXT: orps %xmm3, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: andps %xmm5, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] -; 
SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm8 ; SSE-NEXT: movdqa (%r9), %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa (%rdx), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: andps %xmm11, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: 
punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm13[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm10 +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm14 -; SSE-NEXT: movdqa 16(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: andps %xmm11, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = 
xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm14 -; SSE-NEXT: movdqa 32(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rax), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 32(%r8), %xmm11 +; SSE-NEXT: movdqa 32(%r9), %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 32(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 32(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: 
punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2] +; SSE-NEXT: pandn %xmm5, %xmm15 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm0 +; SSE-NEXT: orps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: 
movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 48(%r8), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%r9), %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 48(%r8), %xmm12 +; SSE-NEXT: movdqa 48(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa 48(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 
+; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa 48(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm15, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: orps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 64(%r8), %xmm1 -; SSE-NEXT: movdqa 64(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 64(%r8), %xmm2 +; SSE-NEXT: movdqa 64(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa 64(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: movdqa 64(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] @@ -7172,65 +7270,65 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] ; SSE-NEXT: andps %xmm9, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: orps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rax), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 80(%r8), %xmm1 -; SSE-NEXT: movdqa 80(%r9), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa 80(%r8), %xmm2 +; SSE-NEXT: movdqa 80(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa 80(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 80(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa 80(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: movdqa 80(%rdi), %xmm7 ; SSE-NEXT: movdqa 80(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 @@ -7240,250 +7338,241 @@ ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] ; SSE-NEXT: andps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: orps %xmm2, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd (%rsp), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[3,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: andps %xmm13, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm4, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: psrld $16, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm11 
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm2[3,0] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: orps %xmm2, %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2] +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: andps %xmm8, %xmm1 +; SSE-NEXT: movaps 
%xmm1, %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] ; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: andps %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[2,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,1,3] +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -7496,8 +7585,8 @@ ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] @@ -7508,15 +7597,14 @@ ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -7529,8 +7617,8 @@ ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] @@ -7540,11 +7628,11 @@ ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] @@ -7561,8 +7649,8 @@ ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] @@ -7572,11 +7660,11 @@ ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] @@ -7591,11 +7679,10 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] @@ -7617,16 +7704,16 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] @@ -7649,16 +7736,16 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; 
SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] @@ -7679,12 +7766,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] ; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,0,0,0,65535,65535,65535] ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 @@ -7703,13 +7790,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] -; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm13[1,1] +; SSE-NEXT: movaps %xmm13, %xmm10 ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 @@ -7727,13 +7814,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] +; SSE-NEXT: movaps %xmm6, %xmm12 ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 @@ -7751,14 +7838,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; 
SSE-NEXT: pand %xmm5, %xmm0 @@ -7782,8 +7868,8 @@ ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] +; SSE-NEXT: movdqa %xmm8, %xmm13 ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 @@ -7807,7 +7893,8 @@ ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, %xmm9 ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm0 @@ -7837,27 +7924,28 @@ ; SSE-NEXT: andnps %xmm1, %xmm5 ; SSE-NEXT: orps %xmm0, %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: andps %xmm2, %xmm9 +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,0],mem[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: orps %xmm9, %xmm2 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: andps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: andnps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm2, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: andps %xmm6, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: andnps %xmm1, %xmm5 -; SSE-NEXT: orps %xmm7, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andnps %xmm1, %xmm6 +; SSE-NEXT: orps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2,2],mem[2,0] @@ -7881,19 +7969,19 @@ ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movapd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movapd %xmm10, %xmm5 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1],mem[0] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm5 -; SSE-NEXT: orps %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -7902,15 +7990,18 @@ ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[1],mem[0] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm12 -; SSE-NEXT: orps %xmm1, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm5, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -7919,16 +8010,16 @@ ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1],mem[0] +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm11 -; SSE-NEXT: orps %xmm1, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm5, %xmm12 +; SSE-NEXT: orps %xmm1, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -7937,17 +8028,16 @@ ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, 
%xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1],mem[0] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm9 -; SSE-NEXT: orps %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm5, %xmm10 +; SSE-NEXT: orps %xmm1, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -7956,16 +8046,15 @@ ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[1],mem[0] +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm7 -; SSE-NEXT: orps %xmm1, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm5, %xmm8 +; SSE-NEXT: orps %xmm1, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -7974,17 +8063,17 @@ ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm5 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1],mem[0] +; SSE-NEXT: movaps %xmm9, %xmm14 +; SSE-NEXT: movaps %xmm9, %xmm6 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm5 -; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm5, %xmm6 +; SSE-NEXT: orps %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7992,14 +8081,14 @@ ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: shufpd $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[0] +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,1] -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: andps %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0] @@ -8007,13 +8096,13 @@ ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 ; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0] @@ -8021,12 +8110,12 @@ ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: por %xmm0, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0] @@ -8034,25 +8123,25 @@ ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: andps %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[2,1] ; SSE-NEXT: pshufd 
$80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: andps %xmm4, %xmm13 -; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: andps %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0] @@ -8060,8 +8149,7 @@ ; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: pandn %xmm0, %xmm13 @@ -8098,20 +8186,23 @@ ; SSE-NEXT: movdqa %xmm15, 560(%rax) ; SSE-NEXT: movdqa %xmm13, 448(%rax) ; SSE-NEXT: movdqa %xmm3, 336(%rax) -; SSE-NEXT: movdqa %xmm6, 224(%rax) -; SSE-NEXT: movdqa %xmm8, 112(%rax) -; SSE-NEXT: movdqa %xmm10, (%rax) -; SSE-NEXT: movdqa %xmm2, 736(%rax) -; SSE-NEXT: movaps %xmm5, 624(%rax) -; SSE-NEXT: movaps %xmm7, 512(%rax) -; SSE-NEXT: movaps %xmm9, 400(%rax) -; SSE-NEXT: movaps %xmm11, 288(%rax) -; SSE-NEXT: movaps %xmm12, 176(%rax) +; SSE-NEXT: movdqa %xmm7, 224(%rax) +; SSE-NEXT: movdqa %xmm9, 112(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm5, 736(%rax) +; SSE-NEXT: movaps %xmm6, 624(%rax) +; SSE-NEXT: movaps %xmm8, 512(%rax) +; SSE-NEXT: movaps %xmm10, 400(%rax) +; SSE-NEXT: movaps %xmm12, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 832(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] @@ -8197,149 +8288,149 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: addq $1640, %rsp # imm = 0x668 +; SSE-NEXT: addq $1656, %rsp # imm = 0x678 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm4 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm10[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm10[6],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm15 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2,3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,5,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm14[1],xmm11[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4],xmm8[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2,3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm3[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm1[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 @@ -8361,9 +8452,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%r9), 
%xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8378,9 +8469,9 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 @@ -8410,14 +8501,15 @@ ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 @@ -8432,26 +8524,27 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = 
zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,2],xmm3[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] @@ -8469,9 +8562,8 @@ ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] @@ -8479,7 +8571,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -8491,12 +8583,12 @@ ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; 
AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 @@ -8507,8 +8599,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm8 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8523,9 +8616,8 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm3, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm13, %ymm9 ; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm8 @@ -8534,11 +8626,11 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 @@ -8553,77 +8645,76 @@ ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, 
%xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[0,2],xmm11[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -8631,15 +8722,15 @@ ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 
%xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 @@ -8650,21 +8741,21 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] @@ -8678,11 +8769,11 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7] -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] @@ -8690,13 +8781,12 @@ ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm3[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm1 @@ -8709,41 +8799,40 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm7 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm3 ; AVX1-ONLY-NEXT: 
vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm5[0,2],xmm2[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -8761,13 +8850,13 @@ ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm15 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -8775,12 +8864,11 @@ ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8808,19 +8896,19 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] @@ -8838,66 +8926,67 @@ ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 
%xmm2, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm14[1],xmm6[1] +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm0[1],xmm6[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = 
[65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX1-ONLY-NEXT: vandps %ymm8, %ymm12, %ymm12 @@ -8923,12 +9012,12 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm12 @@ -9009,11 +9098,11 @@ ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 +; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] @@ -9191,21 +9280,21 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 880(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 848(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%rax) ; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -9362,7 +9451,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] @@ -9537,7 +9626,7 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd $165, (%rsp), %xmm1 # 16-byte Folded Reload @@ -9827,8 +9916,7 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload @@ -10085,10 +10173,9 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] @@ -10100,22 +10187,22 @@ ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] @@ -10124,15 +10211,14 @@ ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm6 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 @@ -10144,343 +10230,339 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7,8,9],ymm3[10],ymm14[11,12],ymm3[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9,22,23,22,23,22,23,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm5 ; 
AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovdqu 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm14, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} 
ymm3 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; 
AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7,8,9],ymm14[10],ymm2[11,12],ymm14[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm14 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = 
mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8,9,10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8,9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; 
AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm4 -; AVX2-FAST-NEXT: 
vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpblendvb 
%ymm7, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10488,7 +10570,7 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm15 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -10504,7 +10586,7 @@ ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm15 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 @@ -10512,12 +10594,11 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -10569,8 +10650,8 @@ ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 @@ -10583,17 +10664,17 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm11 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm11 ; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd 96(%rax), %ymm14 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm15, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10601,13 +10682,13 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm7, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 ; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] @@ -10616,12 +10697,12 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufd $165, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] @@ -10660,14 +10741,14 @@ ; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm6, 
%xmm13, %xmm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd 68(%rax), %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vpbroadcastd 100(%rax), %ymm6 @@ -10676,7 +10757,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm13 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] @@ -10690,9 +10771,9 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] @@ -10786,7 +10867,7 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 512(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 480(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) @@ -10817,30 +10898,30 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm5 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -10941,11 +11022,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] @@ -10958,7 +11039,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 
= ymm5[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 @@ -10967,10 +11048,10 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -10981,7 +11062,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 @@ -10990,10 +11071,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm5[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] @@ -11003,8 +11085,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] @@ -11013,7 +11095,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -11023,7 +11105,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] @@ -11038,12 +11120,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
%xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -11108,11 +11190,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rax), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm13, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rax), %ymm15 @@ -11126,13 +11208,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd $165, (%rsp), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[1,1,2,3] @@ -11141,8 +11223,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 @@ -11204,10 +11286,10 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] @@ -11286,8 +11368,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] @@ -11330,7 +11411,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -11370,13 +11451,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -11412,7 +11492,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9,22,23,22,23,22,23,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] @@ -11443,49 +11523,46 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm12 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8,9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] @@ -11574,2873 +11651,2959 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2168, %rsp # imm = 0x878 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = 
<12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: subq $3112, %rsp # imm = 0xC28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, 
%ymm6, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm15, %ymm14, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm14, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6] -; 
AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8,9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7,8,9],ymm12[10],ymm6[11,12],ymm12[13],ymm6[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero,ymm12[u,u],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm9, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm14[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm1, %ymm12, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm15, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rax), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm6, %ymm17, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm12, %ymm17, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm7, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = 
ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm12, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = 
ymm1[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm15, %ymm16, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8,9,10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8,9,10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7,8,9],ymm4[10],ymm9[11,12],ymm4[13],ymm9[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm12, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u],zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpandnq %ymm0, %ymm16, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm3, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: 
vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm21, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermd 64(%rax), %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3],xmm6[4],xmm9[5,6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = 
xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = 
ymm5[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm5, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm15, %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
%zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm29[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm29[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6,7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm26[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8,9,10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm15[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm30[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm27[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7,8,9],ymm9[10],ymm15[11,12],ymm9[13],ymm15[14,15] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm6, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm12, %zmm15, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $182, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm24[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm23[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq 
{{.*#+}} ymm0 = ymm21[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm20[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm19[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm18[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm9[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm11, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} 
zmm2 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm0[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vpermd (%rax), %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm0[2,1,3,2] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm3[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm4[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm10, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[2,3,3,3,6,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm12[2],xmm15[3,4],xmm12[5],xmm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,7,6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm13[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; 
AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm29 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm21 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm27, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte 
Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm23[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm23 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,7,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 
32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm22, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm22, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 124(%r8), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm24, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm24, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm23 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm14, %zmm16, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermq $212, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm3 = mem[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm12 = 
mem[0,1,3,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[2,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm2 = mem[2,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm4 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: 
vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm10 = mem[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm25 = mem[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm23 = mem[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm13 = mem[0,1,3,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm2[2,3,3,3,6,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[0,0,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm23, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm27[2,1,3,2] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm22, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, 
%zmm0, %zmm22, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2168, %rsp # imm = 0x878 -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $3112, %rsp # imm = 0xC28 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq +; +; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $1688, %rsp # imm = 0x698 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm13, %ymm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm8, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm13, %ymm9, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm13, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = 
<10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,2,2,3,10,9,11,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,u,6,u,u,6> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm28, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm4, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm4, %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8,9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8,9,10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, 
%ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7,8,9],ymm7[10],ymm4[11,12],ymm7[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8,9,10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm8, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,2,3,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vprold 
$16, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,1,1,8,8,10,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,10,9,11,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6,7,8],ymm2[9],ymm8[10,11],ymm2[12],ymm8[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,3,8,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,u,u,u,6,u,u,6> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 124(%r8), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd 
{{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,2,2,2,6,6,6,6] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm7, %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,2,2,3,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, 
%xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,10,9,11,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,1,1,8,8,10,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm28[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7,8,9],ymm9[10],ymm12[11,12],ymm9[13],ymm12[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm6, %zmm29, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm28, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm17[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7,8,9],ymm2[10],ymm9[11,12],ymm2[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 
= xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8,9,10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm27, %ymm28, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm9, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6,7,8],ymm1[9],ymm9[10,11],ymm1[12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm28, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm23, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm31, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm0[2],xmm6[3,4],xmm0[5],xmm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,2,1,8,8,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm26, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm2 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm20 = ymm2[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,2,3,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm23, %ymm28, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm30, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm13, %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; 
AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7,8,9],ymm13[10],ymm5[11,12],ymm13[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm28, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm11, %zmm31, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,1,1,8,8,10,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] 
+; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,2,1,8,8,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,8,8,10,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm27, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm26, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm26, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,u,u],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm5, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm11, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; 
AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm11, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 
= [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm11, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm15[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
{{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm16, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm16, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm10 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1688, %rsp # imm = 0x698 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2168, %rsp # imm = 0x878 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm2 -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512DQ-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512DQ-SLOW-NEXT: subq $3112, %rsp # imm = 0xC28 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm7 ; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; 
AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 -; AVX512DQ-SLOW-NEXT: vpor %ymm15, %ymm14, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,1,3,2,10,10,10,11] -; 
AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm25 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8,9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7,8,9],ymm12[10],ymm6[11,12],ymm12[13],ymm6[14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,ymm12[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero,ymm12[u,u],zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm9, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm14 -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm14[0,1,2,3],zmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm18 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512DQ-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm12 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm1, %ymm12, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; 
AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm15 +; AVX512DQ-SLOW-NEXT: vpandn %ymm15, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rax), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm15 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm14 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512DQ-SLOW-NEXT: vpandnq %ymm6, %ymm17, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm12 -; AVX512DQ-SLOW-NEXT: vpandnq %ymm12, %ymm17, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm7, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = 
ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm29[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpandn %ymm14, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm15, %ymm12, %ymm14 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm11[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandnq %ymm15, %ymm16, %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8,9,10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8,9,10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; 
AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7,8,9],ymm4[10],ymm9[11,12],ymm4[13],ymm9[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7,8,9],ymm0[10],ymm4[11,12],ymm0[13],ymm4[14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermd %zmm12, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u],zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm4 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpandn %ymm0, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rax), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vpandnq %ymm0, %ymm16, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, %xmm14 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm3, %xmm2 
+; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm10 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-SLOW-NEXT: 
vpshufb %ymm11, %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm7 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,3,3,10,9,11,10] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm15 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermd 64(%rax), %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] 
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm5, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm14, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm29[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm29[3,3,3,3,7,7,7,7] +; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5,6,7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm26[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8,9,10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm15[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovups %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm30[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm27[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = 
ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7,8,9],ymm9[10],ymm15[11,12],ymm9[13],ymm15[14,15] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm15 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermd %zmm6, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermd %zmm12, %zmm15, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $182, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm24[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm23[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm21[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm20[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm19[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm18[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm17[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm16[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqu 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm23[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm11, %zmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3],xmm6[4],xmm9[5,6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = 
[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm30, %zmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm28, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm4 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[1,1,1,1,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm11, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm0[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vprold $16, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-SLOW-NEXT: vpermd (%rax), %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm0[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm3[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm4[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vprold $16, %xmm10, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[2,3,3,3,6,7,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm22 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa 
32(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512DQ-SLOW-NEXT: vprold $16, %xmm2, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm12[2],xmm15[3,4],xmm12[5],xmm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm1[0,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,7,6] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpshufd $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm23 
= mem[1,2,2,3,5,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm13[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm29 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm26 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm21 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm24 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm27, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; 
AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm28 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm23[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm23 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,7,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512DQ-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,1,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm22, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 124(%r8), %ymm24 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm24, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm24, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $248, %zmm2, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm19 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm23 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm30 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm27 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vbroadcasti32x8 {{.*#+}} zmm16 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermd %zmm14, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm28, %zmm16 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermq $212, (%rsp), %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm3 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm6 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm26 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 
= mem[0,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm12 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[2,1,3,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm2 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm29 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm29 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm27, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm26, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm10 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm10 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm25 = mem[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm20 = mem[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm23 = mem[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm13 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm2[2,3,3,3,6,7,7,7] +; AVX512DQ-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[0,0,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm27[2,1,3,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm22, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm22, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm29, %zmm30 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 704(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512DQ-SLOW-NEXT: addq $2168, %rsp # imm = 0x878 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 832(%rax) +; AVX512DQ-SLOW-NEXT: addq $3112, %rsp # imm = 0xC28 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = 
[128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: subq $1688, %rsp # imm = 0x698 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm13, %ymm17 ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = 
[128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm9 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vpor %ymm9, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm11 +; AVX512DQ-FAST-NEXT: 
vpor %ymm2, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm9 -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm8, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm13, %ymm9, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm13, %ymm9 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpor 
%ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpandn %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,2,2,3,10,9,11,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
[2,2,2,3,8,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,u,u,6,u,u,6> -; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm4 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm28, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,1,1,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm4, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm14 -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm4, %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8,9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm18, %zmm7 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8,9,10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = 
ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm20, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7,8,9],ymm7[10],ymm4[11,12],ymm7[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = 
ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8,9,10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm29, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8,9,10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,2,2,3,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm30[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, 
%ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6,7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6,7,8],ymm2[9],ymm8[10,11],ymm2[12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,u,u,u,6,u,u,6> +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 124(%r8), %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm3 -; 
AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm9, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm14, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,2,2,2,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; 
AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm7, %zmm2, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,2,2,3,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm23, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: 
vpshufd {{.*#+}} ymm4 = ymm18[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm4, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm9 +; 
AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7,8,9],ymm9[10],ymm12[11,12],ymm9[13],ymm12[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm17[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm6, %zmm29, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm28, %zmm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm17[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7,8,9],ymm2[10],ymm9[11,12],ymm2[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm12, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8,9,10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm28, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm30, %zmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6,7,8],ymm1[9],ymm9[10,11],ymm1[12],ymm9[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm28, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm23, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm31, %zmm24 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm0[2],xmm6[3,4],xmm0[5],xmm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,2,1,8,8,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm26[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm26, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: 
vmovdqa64 %xmm29, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8,9,10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13,14,15] +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm2 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,2,3,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm28, %ymm9 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, %zmm30, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm13, %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm13 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7,8,9],ymm13[10],ymm5[11,12],ymm13[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm28, %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm11, 
%zmm31, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,2,1,8,8,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,1,1,8,8,10,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: 
vpshufb %xmm6, %xmm7, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm27, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm26, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm6 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512DQ-FAST-NEXT: # zmm6 = 
mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,u,u],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm11, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm11, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm13 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm17 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm18 = mem[2,2,3,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm20 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm15[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm7, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm16, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm16, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm0 = mem[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%zmm14, %zmm10 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 768(%rax) -; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $1688, %rsp # imm = 0x698 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $136, %rsp +; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm20 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm21 +; 
AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm22, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm0 +; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 +; AVX512BW-NEXT: kmovd %ecx, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> -; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3 -; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm28, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm10 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C -; AVX512BW-NEXT: kmovd %ecx, %k3 -; 
AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm24, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm25, %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm23, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm31 +; AVX512BW-NEXT: vpermi2w %zmm4, %zmm20, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm19, %zmm9 +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm9 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm17, %zmm11, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm21, %zmm22, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: 
vmovdqu16 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm3, %zmm19 +; AVX512BW-NEXT: vpermt2w %zmm20, %zmm5, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm4 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 -; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E +; AVX512BW-NEXT: vpermi2w %zmm17, %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vpermi2w %zmm21, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm27, %zmm0 +; AVX512BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E +; AVX512BW-NEXT: kmovd %ecx, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermt2w %zmm21, %zmm3, %zmm0 +; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 +; AVX512BW-NEXT: kmovd %ecx, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm28 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm16, %zmm12 +; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm12 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm19, %zmm13 +; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512BW-NEXT: vpermi2w %zmm30, %zmm20, %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> +; AVX512BW-NEXT: vpermt2w %zmm21, %zmm27, %zmm0 +; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; 
AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 -; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm13 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm24 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm16, %zmm19 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm19 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm30, %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm27, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 -; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 -; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm2, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] +; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm14 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 -; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> -; AVX512BW-NEXT: vpermt2w 
%zmm25, %zmm2, %zmm10 -; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm16, %zmm7, %zmm25 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm15 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vpermt2w %zmm21, %zmm24, %zmm0 +; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm14 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm30, %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm24, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm15 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18 -; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm18 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 -; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> +; AVX512BW-NEXT: vpermt2w %zmm21, %zmm23, %zmm25 +; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} -; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm18 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm20, %zmm30, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm23, %zmm24 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm23, %zmm31 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2w %zmm20, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> +; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm22 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermt2w %zmm21, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm2 {%k2} +; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm22 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm28, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vpermi2w %zmm30, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> +; AVX512BW-NEXT: 
vpermi2w %zmm30, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermi2w %zmm28, %zmm5, %zmm3 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 576(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512BW-NEXT: addq $136, %rsp +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -21,28 +21,26 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa (%r11), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,7,5] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, 16(%rax) -; SSE-NEXT: movapd 
%xmm3, (%rax) +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf2: @@ -54,12 +52,12 @@ ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] @@ -77,16 +75,18 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-ONLY-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) @@ -99,16 +99,16 @@ ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 -; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512F-NEXT: vpunpckldq (%rcx){1to4}, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX512F-NEXT: vpunpckldq (%r10){1to4}, %xmm2, %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: vmovdqa %ymm0, (%rax) @@ -121,18 +121,19 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,16,18,8,10,24,26,1,3,17,19,9,11,25,27] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512BW-NEXT: vpunpckldq (%rcx){1to4}, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX512BW-NEXT: vpunpckldq (%r10){1to4}, %xmm2, %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] +; 
AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64 @@ -205,64 +206,60 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm1[1,2,3],xmm11[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -532,79 +529,79 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa (%r8), %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa (%r10), %xmm3 +; SSE-NEXT: movdqa (%r10), %xmm4 ; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] ; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm13[0],xmm6[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; 
SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movapd %xmm10, 96(%rax) ; SSE-NEXT: movaps %xmm11, 80(%rax) ; SSE-NEXT: movapd %xmm8, 64(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; 
SSE-NEXT: movapd %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movapd %xmm2, (%rax) +; SSE-NEXT: movapd %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf8: @@ -849,362 +846,330 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $88, %rsp +; SSE-NEXT: pushq %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm12 ; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa (%rcx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm8 -; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa (%r10), %xmm11 -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa (%rdx), %xmm13 +; SSE-NEXT: movdqa (%rcx), %xmm2 +; SSE-NEXT: movdqa (%r8), %xmm14 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa (%r10), %xmm15 +; SSE-NEXT: movdqa (%rax), %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd 
{{.*#+}} xmm6 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa 16(%r10), %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; SSE-NEXT: movdqa 16(%rcx), %xmm3 +; SSE-NEXT: movdqa 16(%rax), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: movdqa 16(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%r10), %xmm15 -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: movdqa 
%xmm15, %xmm14 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; SSE-NEXT: movdqa 16(%r8), %xmm4 -; SSE-NEXT: movdqa 16(%r9), %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2] +; 
SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm12[0],xmm11[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa 16(%rcx), %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] -; SSE-NEXT: 
punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm15[2],xmm4[3],xmm15[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm1, 224(%rax) -; SSE-NEXT: movaps %xmm3, 240(%rax) -; SSE-NEXT: movapd %xmm10, 160(%rax) -; SSE-NEXT: movaps %xmm8, 176(%rax) -; SSE-NEXT: movapd %xmm13, 96(%rax) -; SSE-NEXT: movaps %xmm12, 
112(%rax) -; SSE-NEXT: movapd %xmm5, 32(%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movapd %xmm9, 192(%rax) -; SSE-NEXT: movaps %xmm11, 208(%rax) +; SSE-NEXT: movaps %xmm6, 240(%rax) +; SSE-NEXT: movapd %xmm9, 224(%rax) +; SSE-NEXT: movaps %xmm8, 208(%rax) +; SSE-NEXT: movapd %xmm4, 192(%rax) +; SSE-NEXT: movaps %xmm2, 176(%rax) +; SSE-NEXT: movapd %xmm3, 160(%rax) +; SSE-NEXT: movaps %xmm14, 144(%rax) +; SSE-NEXT: movapd %xmm12, 128(%rax) +; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movapd %xmm11, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $88, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm14, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} 
xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2],ymm2[3],ymm15[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} 
xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3],ymm9[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm12[0,1,2],ymm8[3],ymm12[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3],ymm13[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1212,137 +1177,138 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = 
ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm8 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq 
{{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1352,135 +1318,139 @@ ; ; AVX2-FAST-LABEL: store_i16_stride8_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: pushq %rax +; AVX2-FAST-NEXT: subq $40, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm3 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2,3,4],ymm2[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm13 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm15 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; 
AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3],ymm14[4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm10[2,3],ymm3[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,6,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; 
AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: popq %rax +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1488,137 +1458,138 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
ymm4 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3],ymm11[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; 
AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1630,83 +1601,77 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512F-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-NEXT: vmovdqa (%r10), %ymm4 -; AVX512F-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-NEXT: vmovdqa (%rax), %xmm5 ; AVX512F-NEXT: vmovdqa (%r10), %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa (%r9), %xmm5 +; AVX512F-NEXT: vmovdqa (%r9), %xmm7 ; AVX512F-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm18 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-NEXT: vmovdqa (%r9), %ymm9 +; AVX512F-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm19 +; AVX512F-NEXT: vmovdqa (%r10), %ymm15 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-NEXT: vmovdqa (%rax), %ymm12 +; AVX512F-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> -; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> +; AVX512F-NEXT: vpermt2d %zmm16, %zmm5, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> +; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm0 ; AVX512F-NEXT: movb $-86, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 
{%k1} +; AVX512F-NEXT: vpermt2d %zmm18, %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} ; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] +; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2121,260 +2086,284 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: subq $232, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = 
xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, 
%ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} 
xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; 
AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = 
xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3],ymm10[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r10), 
%xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1],ymm12[2,3],ymm1[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: 
vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 @@ -2382,311 +2371,261 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm4 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride8_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: subq $232, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm10[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] 
+; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] 
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm15 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = 
xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = 
ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-SLOW-NEXT: 
vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm13 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} 
ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm6[0],ymm15[0],ymm6[1],ymm15[1],ymm6[2],ymm15[2],ymm6[3],ymm15[3],ymm6[8],ymm15[8],ymm6[9],ymm15[9],ymm6[10],ymm15[10],ymm6[11],ymm15[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] 
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] @@ -2694,584 +2633,557 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm15[4],ymm6[5],ymm15[5],ymm6[6],ymm15[6],ymm6[7],ymm15[7],ymm6[12],ymm15[12],ymm6[13],ymm15[13],ymm6[14],ymm15[14],ymm6[15],ymm15[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 448(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 384(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; 
AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm7 +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm10 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm1 -; 
AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm10, %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3],ymm0[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = 
ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3],ymm13[4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm15, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 480(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 384(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), 
%xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; 
AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
(%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 
= ymm3[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 
= ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = 
ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm6[0],ymm15[0],ymm6[1],ymm15[1],ymm6[2],ymm15[2],ymm6[3],ymm15[3],ymm6[8],ymm15[8],ymm6[9],ymm15[9],ymm6[10],ymm15[10],ymm6[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,2,2,3,4,6,6,7] @@ -3279,62 +3191,62 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm15[4],ymm6[5],ymm15[5],ymm6[6],ymm15[6],ymm6[7],ymm15[7],ymm6[12],ymm15[12],ymm6[13],ymm15[13],ymm6[14],ymm15[14],ymm6[15],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3342,338 +3254,309 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm19, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm20, %zmm29 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm20, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm21, %zmm29 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm27 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm19, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm2 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm22, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm23, %zmm28 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm15[0],ymm0[1],ymm15[1],ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[8],ymm15[8],ymm0[9],ymm15[9],ymm0[10],ymm15[10],ymm0[11],ymm15[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm24, %zmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm7 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm8[4],ymm5[4],ymm8[5],ymm5[5],ymm8[6],ymm5[6],ymm8[7],ymm5[7],ymm8[12],ymm5[12],ymm8[13],ymm5[13],ymm8[14],ymm5[14],ymm8[15],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm22, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm23, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm15[4],ymm0[5],ymm15[5],ymm0[6],ymm15[6],ymm0[7],ymm15[7],ymm0[12],ymm15[12],ymm0[13],ymm15[13],ymm0[14],ymm15[14],ymm0[15],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm24, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm28, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm26, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm19, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm26, %zmm15 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm18, %zmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm19, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm20, %zmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm18, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm19, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm20, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm26, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: movb $-86, %al -; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm22, %zmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm23, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm24, %zmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm26, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm22, %zmm8 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm23, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm24, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm3 {%k2} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-SLOW-NEXT: movb $-86, %cl +; AVX512F-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX512F-FAST-NEXT: subq $136, %rsp ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-FAST-NEXT: 
vmovdqa64 %xmm2, %xmm31 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm23 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm21 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm22 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm23 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm24 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm28 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm17 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm30 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[8],ymm0[8],ymm9[9],ymm0[9],ymm9[10],ymm0[10],ymm9[11],ymm0[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] 
+; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm9[4],ymm0[4],ymm9[5],ymm0[5],ymm9[6],ymm0[6],ymm9[7],ymm0[7],ymm9[12],ymm0[12],ymm9[13],ymm0[13],ymm9[14],ymm0[14],ymm9[15],ymm0[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[8],ymm0[8],ymm13[9],ymm0[9],ymm13[10],ymm0[10],ymm13[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm19 ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm31, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm25 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm31, %zmm17 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm0, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm24 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm18, %zmm15 +; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 +; AVX512F-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm21, %zmm15 {%k2} +; AVX512F-FAST-NEXT: movb $-86, %al +; AVX512F-FAST-NEXT: kmovw %eax, %k3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm15 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm1, %zmm19 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm0, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm18, %zmm20 +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm21, %zmm20 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 {%k3} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm19, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm23, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm24, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm25, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm26 +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm27 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm29 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm31, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm31, %zmm7 -; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm0, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm12, %zmm22 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm3 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm12, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm2, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm2, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm2 {%k1} -; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm5, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm9, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm5, %zmm14 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm5, %zmm9 {%k2} -; AVX512F-FAST-NEXT: movb $-86, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm16 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm19, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm23, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm24, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm25, %zmm17 {%k2} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: addq $472, %rsp # imm = 0x1D8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm17 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm0, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm18, %zmm8 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm21, %zmm8 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm18, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm21, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm19, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm23, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm24, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm25, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm19, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm23, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm24, %zmm3 +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm25, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k3} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-FAST-NEXT: 
vmovdqa64 %zmm8, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, (%rax) +; AVX512F-FAST-NEXT: addq $136, %rsp ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -6840,651 +6723,645 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm31, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm6, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm10, %zmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = 
ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm16, %zmm31 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = 
xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm19, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm13, %zmm15 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm31, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm10, %zmm14 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm0 {%k2} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 
64(%r8), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm19, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm4, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm4, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm7 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm4 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm30, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; 
AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm20, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm31, %zmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm18, %zmm6 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm10, %zmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm31, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm18, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm10, %zmm18 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm19, %zmm28 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm19, %zmm25 -; AVX512F-SLOW-NEXT: 
vpermd %zmm0, %zmm18, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm21 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm19, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm19, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm16, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm30, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), 
%xmm0 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm30, %zmm17 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm30, %zmm8 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm30, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[8],ymm6[8],ymm1[9],ymm6[9],ymm1[10],ymm6[10],ymm1[11],ymm6[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm17, %zmm21 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm21 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm23, %zmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11] +; AVX512F-SLOW-NEXT: 
vmovdqa64 %zmm22, %zmm24 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm22, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm23 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15] ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm25, %zmm22 +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm24, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm31, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm8, %zmm25 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm10, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm13, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm31, %zmm27 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm8, %zmm27 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm10, %zmm26 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm17, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm16, %zmm28 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm17, %zmm29 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm16, %zmm29 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm31, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm8, %zmm30 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm31, %zmm31 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm8, %zmm31 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; 
AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm13, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm10 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm17, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm16, %zmm13 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm7, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm7, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm17, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm17, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = 
ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm17, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 
128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 896(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, (%rax) +; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX512F-FAST-NEXT: subq $2376, %rsp # imm = 0x948 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, 
%ymm5, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = 
ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm19 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[8],ymm8[8],ymm5[9],ymm8[9],ymm5[10],ymm8[10],ymm5[11],ymm8[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm8[4],ymm5[5],ymm8[5],ymm5[6],ymm8[6],ymm5[7],ymm8[7],ymm5[12],ymm8[12],ymm5[13],ymm8[13],ymm5[14],ymm8[14],ymm5[15],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm25 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm26 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm29 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm27 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm30 +; AVX512F-FAST-NEXT: 
vinserti64x4 $1, %ymm5, %zmm5, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm14 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: 
vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm14 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm24 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm1, %zmm25 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm1, %zmm27 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm1, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd (%rsp), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm19 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm1, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm0, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm0, %zmm24 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm1, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm1, %zmm25 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm0, %zmm25 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 {%k1} 
# 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm11, %zmm20 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm20 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm11, %zmm12 +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm0, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm0, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm7 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm1, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm1, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm16 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm28 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm29 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm23 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm23 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm0, %zmm20 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm10, %zmm20 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm22 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm22 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm26 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm27 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm27 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm28 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm14, %zmm15 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm14, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm14, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm4 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm22, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm29 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm29 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm29 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm26 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm22, %zmm26 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm26 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm14, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm30 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm30 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm22, %zmm30 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm31 
+; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm14, %zmm31 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm31 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm31 -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm22, %zmm31 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm13 +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm14, %zmm13 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm13 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm22, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm22, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm11, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm22, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm11, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm14, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm14, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm14, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm16, %zmm9 {%k1} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm26 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm30 {%k1} -; 
AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) -; AVX512F-FAST-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-FAST-NEXT: addq $2376, %rsp # imm = 0x948 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -7499,57 +7376,57 @@ ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7571,10 +7448,10 @@ ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; 
AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7595,33 +7472,33 @@ ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7640,28 +7517,28 @@ ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> 
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7762,22 +7639,22 @@ ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -89,15 +89,15 @@ 
; SSE-NEXT: movaps (%rsi), %xmm2 ; SSE-NEXT: movaps 16(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf8: @@ -159,25 +159,25 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm6 ; SSE-NEXT: movaps 48(%rsi), %xmm7 ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm6, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm6, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf16: @@ -190,12 +190,12 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX1-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -204,8 +204,8 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -217,16 +217,16 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -269,100 +269,100 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm15 ; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = 
xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] ; SSE-NEXT: movaps 112(%rsi), %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps %xmm13, 176(%rdx) -; SSE-NEXT: movaps %xmm3, 128(%rdx) -; SSE-NEXT: movaps %xmm15, 144(%rdx) -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movaps %xmm0, 240(%rdx) +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm12, 192(%rdx) +; SSE-NEXT: movaps %xmm4, 176(%rdx) +; SSE-NEXT: movaps %xmm13, 160(%rdx) +; SSE-NEXT: movaps %xmm3, 144(%rdx) +; SSE-NEXT: movaps %xmm15, 128(%rdx) +; SSE-NEXT: movaps %xmm5, 112(%rdx) +; SSE-NEXT: movaps %xmm14, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm10, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm9, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), 
%xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = 
xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -378,28 +378,28 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[4],ymm7[4],ymm3[5],ymm7[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm8[2,3] +; 
AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -409,17 +409,17 @@ ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -585,106 +585,106 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 
; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 
416(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -707,56 +707,56 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[0,1],ymm15[0,1] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[6],ymm13[6],ymm7[7],ymm13[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm7[0],ymm13[0],ymm7[1],ymm13[1],ymm7[4],ymm13[4],ymm7[5],ymm13[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm8[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm8 = ymm8[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm8[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -770,10 +770,10 @@ ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 
{{.*#+}} zmm8 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -783,14 +783,14 @@ ; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm2 ; AVX512-NEXT: vpermi2d %zmm7, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -35,33 +35,46 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vmovlps %xmm1, 16(%rcx) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; -; AVX2-LABEL: store_i32_stride3_vf2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,1,3,5,u,u> -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovlps %xmm1, 16(%rcx) -; AVX2-NEXT: vmovaps %xmm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: store_i32_stride3_vf2: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; 
AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,u,1,5,u,u,u> +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-ONLY-NEXT: vmovlps %xmm1, 16(%rcx) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rcx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i32_stride3_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,10,1,5,13,u,u> +; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 16(%rcx) +; AVX512-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64 %in.vec2 = load <2 x i32>, ptr %in.vecptr2, align 64 @@ -79,19 +92,19 @@ ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps (%rsi), %xmm1 ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[0,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: retq @@ -191,42 +204,42 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm3 +; SSE-NEXT: movaps 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm6 +; SSE-NEXT: movaps 16(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm0[1,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[0,2] ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: movaps %xmm9, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps %xmm6, 64(%rcx) -; SSE-NEXT: movaps %xmm1, 32(%rcx) -; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps %xmm4, 64(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 16(%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf8: @@ -294,27 +307,28 @@ ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -374,83 +388,78 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm5 -; SSE-NEXT: movaps (%rsi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps (%rsi), %xmm11 ; SSE-NEXT: movaps 16(%rsi), %xmm9 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] -; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm13[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movaps 32(%rsi), %xmm7 +; SSE-NEXT: movaps 48(%rsi), %xmm5 +; SSE-NEXT: movaps (%rdx), %xmm12 +; SSE-NEXT: movaps 16(%rdx), %xmm10 +; SSE-NEXT: movaps 32(%rdx), %xmm8 +; SSE-NEXT: movaps 48(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm0[1,0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm13[0,2] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: 
unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[0,2] +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm13[0,2] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm13[0,2] +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm2[1,0] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm13[0,2] ; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[2,0] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[1,2],mem[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm7[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm8[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm15, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) -; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm12, 96(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm6, 144(%rcx) -; SSE-NEXT: movaps %xmm11, 160(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm13[0,2] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[1,1] +; 
SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] +; SSE-NEXT: movaps %xmm6, 160(%rcx) +; SSE-NEXT: movaps %xmm13, 144(%rcx) +; SSE-NEXT: movaps %xmm8, 112(%rcx) +; SSE-NEXT: movaps %xmm9, 96(%rcx) +; SSE-NEXT: movaps %xmm10, 64(%rcx) +; SSE-NEXT: movaps %xmm11, 48(%rcx) +; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] +; SSE-NEXT: movaps %xmm3, 176(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 80(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps %xmm4, 128(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] -; SSE-NEXT: movaps %xmm5, 176(%rcx) +; SSE-NEXT: movaps %xmm2, 128(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf16: @@ -471,14 +480,13 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] @@ -486,13 +494,14 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[3,3],xmm5[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] @@ -505,9 +514,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -526,11 +535,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -538,11 +547,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] @@ -556,60 +565,62 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride3_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = 
ymm5[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -627,11 +638,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -639,11 +650,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = 
ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] @@ -657,9 +668,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -700,166 +711,142 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm10 +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rsi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm11 ; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movaps 32(%rsi), %xmm12 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm7 +; SSE-NEXT: movaps 32(%rsi), %xmm9 +; SSE-NEXT: movaps 48(%rsi), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm0 +; SSE-NEXT: movaps 16(%rdx), %xmm14 +; SSE-NEXT: movaps 32(%rdx), %xmm12 ; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,0] +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm12[3,3] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm14[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[1,0] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm12[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps 64(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd 
{{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 80(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 96(%rdx), %xmm13 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[0,3] -; SSE-NEXT: movaps 96(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm8[2,3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[1,1] +; SSE-NEXT: movaps 64(%rdx), %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm15[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm6[1,0] +; SSE-NEXT: movaps 80(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 96(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm7[1,0] +; SSE-NEXT: movaps 96(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] ; SSE-NEXT: movaps 112(%rdi), %xmm0 -; SSE-NEXT: movaps 112(%rdx), %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm7[0,3] -; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] +; SSE-NEXT: movaps 112(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[1,0] +; SSE-NEXT: movaps 112(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] ; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm7[2,3] -; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps %xmm3, 336(%rcx) -; SSE-NEXT: movaps %xmm5, 304(%rcx) -; SSE-NEXT: movaps %xmm6, 288(%rcx) -; SSE-NEXT: movaps %xmm8, 256(%rcx) -; SSE-NEXT: movaps %xmm11, 240(%rcx) -; SSE-NEXT: movaps %xmm12, 208(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: unpckhpd 
{{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2] +; SSE-NEXT: movaps %xmm5, 352(%rcx) +; SSE-NEXT: movaps %xmm1, 336(%rcx) +; SSE-NEXT: movaps %xmm4, 304(%rcx) +; SSE-NEXT: movaps %xmm8, 288(%rcx) +; SSE-NEXT: movaps %xmm9, 256(%rcx) +; SSE-NEXT: movaps %xmm10, 240(%rcx) +; SSE-NEXT: movaps %xmm15, 208(%rcx) +; SSE-NEXT: movaps %xmm11, 192(%rcx) +; SSE-NEXT: movaps %xmm12, 160(%rcx) +; SSE-NEXT: movaps %xmm13, 144(%rcx) +; SSE-NEXT: movaps %xmm14, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -872,79 +859,96 @@ ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 368(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps %xmm4, 320(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] -; SSE-NEXT: movaps %xmm10, 272(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps %xmm9, 224(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] +; SSE-NEXT: movaps %xmm7, 320(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: movaps %xmm6, 272(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 224(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,3] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 
80(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm6[1,1],xmm13[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm12[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm9[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm12[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[1,1],xmm11[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3],xmm4[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1],xmm9[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1],xmm8[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm7[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = 
ymm8[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1],xmm10[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0],xmm9[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm10[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[1,1],xmm12[0,2] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 @@ -955,53 +959,37 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[1,1],xmm14[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm8[3,3],xmm7[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1009,106 +997,106 @@ ; AVX2-SLOW-LABEL: store_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $40, %rsp -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 -; 
AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm14 +; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 352(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm15, 288(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 352(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 256(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-SLOW-NEXT: addq $40, %rsp @@ -1117,203 +1105,203 @@ ; ; AVX2-FAST-LABEL: store_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FAST-NEXT: 
vmovaps 32(%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6],ymm6[7] -; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm7[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2],ymm1[3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm13 +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6],ymm6[7] +; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6],ymm8[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm13, %ymm9 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm12 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6],ymm14[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm14 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5,6],ymm10[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] -; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm5[1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm12, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm13, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rcx) +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm8, 288(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 256(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps 
%ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd 
{{.*#+}} ymm2 = ymm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 352(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 288(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 352(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 256(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp @@ -1333,27 +1321,26 @@ ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] -; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] -; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 -; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512-NEXT: vpermt2d %zmm2, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpermt2d %zmm4, %zmm2, %zmm0 +; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm6 +; AVX512-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm9 
+; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2d %zmm5, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1371,333 +1358,273 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rdi), %xmm6 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm14 -; SSE-NEXT: movaps 48(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm7 +; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movaps 48(%rdi), %xmm9 +; SSE-NEXT: movaps 64(%rsi), %xmm10 +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps 32(%rsi), %xmm12 +; SSE-NEXT: movaps 48(%rsi), %xmm13 +; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps 16(%rdx), %xmm8 -; SSE-NEXT: movaps 32(%rdx), %xmm9 -; SSE-NEXT: movaps 48(%rdx), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm7, %xmm12 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm11[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm11 +; SSE-NEXT: movaps 48(%rdx), %xmm14 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[1,0] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm2[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] -; 
SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm14[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm8[2,3] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm6[1,0] ; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm12[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm11[2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm9[1,0] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm14[2,3] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm13[1,1] +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm7[1,0] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm1[2,3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 80(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps 96(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 96(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 112(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 112(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 128(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 128(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 144(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps 144(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 144(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm13 +; SSE-NEXT: movaps 160(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm13[1,0] +; SSE-NEXT: movaps 160(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm12 -; SSE-NEXT: movaps 176(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm15 +; SSE-NEXT: movaps 176(%rdx), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm15[1,0] ; SSE-NEXT: movaps 176(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps 192(%rdx), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm10[1,0] +; SSE-NEXT: movaps 192(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; 
SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm11[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps 208(%rdi), %xmm6 -; SSE-NEXT: movaps 208(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rsi), %xmm8 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: movaps 208(%rdi), %xmm9 +; SSE-NEXT: movaps 208(%rdx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm9[1,0] +; SSE-NEXT: movaps 208(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[0,2] ; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 224(%rdx), %xmm15 -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm15[0,3] -; SSE-NEXT: movaps 224(%rsi), %xmm4 +; SSE-NEXT: movaps 224(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm5[1,0] +; SSE-NEXT: movaps 224(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2] -; SSE-NEXT: movaps 240(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rdx), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[0,3] -; SSE-NEXT: movaps 240(%rsi), %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} 
xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm9[2,3] -; SSE-NEXT: movaps %xmm0, 736(%rcx) -; SSE-NEXT: movaps %xmm3, 720(%rcx) -; SSE-NEXT: movaps %xmm4, 688(%rcx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps 
{{.*#+}} xmm5 = xmm5[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,2] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm1[1,0] +; SSE-NEXT: movaps 240(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[0,2] +; SSE-NEXT: movaps %xmm6, 736(%rcx) +; SSE-NEXT: movaps %xmm0, 720(%rcx) +; SSE-NEXT: movaps %xmm3, 688(%rcx) ; SSE-NEXT: movaps %xmm7, 672(%rcx) ; SSE-NEXT: movaps %xmm8, 640(%rcx) -; SSE-NEXT: movaps %xmm10, 624(%rcx) -; SSE-NEXT: movaps %xmm11, 592(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 576(%rcx) +; SSE-NEXT: movaps %xmm11, 624(%rcx) +; SSE-NEXT: movaps %xmm12, 592(%rcx) +; SSE-NEXT: movaps %xmm14, 576(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1746,19 +1673,19 @@ ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 752(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 752(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] ; SSE-NEXT: movaps %xmm5, 704(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] -; SSE-NEXT: movaps %xmm6, 656(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] +; SSE-NEXT: movaps %xmm9, 656(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps %xmm10, 608(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: movaps %xmm15, 560(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: movaps %xmm13, 608(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] -; SSE-NEXT: movaps %xmm12, 560(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0,1,3] -; SSE-NEXT: movaps %xmm14, 512(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 512(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 464(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1776,7 +1703,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 224(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1788,7 +1715,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: addq $664, %rsp # imm = 0x298 +; 
SSE-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf64: @@ -2280,197 +2207,205 @@ ; AVX2-FAST-LABEL: store_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 
(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm14[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,3,3,4,4,7,7] +; 
AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm15[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm14 = ymm14[0],ymm1[1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm13[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 152(%rdi), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0,3,3,4,4,7,7] -; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 184(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6],ymm1[7] -; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 152(%rdi), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 216(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm7 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vbroadcastsd 184(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] -; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vbroadcastsd 248(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps %ymm6, 736(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm10, 704(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 672(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 640(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 608(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 576(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 544(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 512(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 480(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 448(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm15, 416(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm14, 384(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm13, 352(%rcx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm12 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vbroadcastsd 216(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6],ymm12[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm15[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm15 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm12[1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vbroadcastsd 248(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm15[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovaps %ymm5, 736(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 704(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 640(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm9, 608(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 544(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 512(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm8, 416(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 352(%rcx) +; AVX2-FAST-NEXT: vmovaps 
%ymm11, 320(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm14, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: addq $232, %rsp @@ -2722,10 +2657,10 @@ ; ; AVX512-LABEL: store_i32_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 @@ -2735,53 +2670,52 @@ ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] -; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] -; AVX512-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2d %zmm11, %zmm14, 
%zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 -; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm11 -; AVX512-NEXT: vpermt2d %zmm10, %zmm20, %zmm11 -; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm6 -; AVX512-NEXT: vpermt2d %zmm9, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 -; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm10 -; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm2 -; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 -; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm4 -; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512-NEXT: vpermt2d %zmm0, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpermt2d %zmm8, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm8 +; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512-NEXT: vpermt2d %zmm1, %zmm15, %zmm19 +; AVX512-NEXT: vpermt2d %zmm9, %zmm17, %zmm19 +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm5 +; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm9 +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm2 +; AVX512-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512-NEXT: vpermi2d %zmm7, %zmm3, %zmm12 +; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 +; AVX512-NEXT: vpermi2d %zmm3, %zmm7, %zmm15 +; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm15 +; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm3 +; AVX512-NEXT: vpermt2d %zmm11, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, 704(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 640(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 512(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -2798,7 +2732,7 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX: {{.*}} ; AVX1: {{.*}} -; AVX2-ONLY: {{.*}} +; AVX2: {{.*}} ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -151,10 +151,10 @@ ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps %xmm5, 16(%r8) ; SSE-NEXT: movaps %xmm6, (%r8) ; SSE-NEXT: retq @@ -247,33 +247,33 @@ ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps %xmm6, 112(%r8) -; SSE-NEXT: movaps %xmm8, 64(%r8) -; SSE-NEXT: movaps %xmm10, 80(%r8) -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm5, 48(%r8) -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps %xmm3, 16(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, 112(%r8) +; SSE-NEXT: movaps %xmm6, 96(%r8) +; SSE-NEXT: movaps %xmm8, 80(%r8) +; SSE-NEXT: movaps %xmm10, 64(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm3, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf8: @@ -345,18 +345,18 @@ ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vunpckhps 
{{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) ; AVX2-ONLY-NEXT: vzeroupper @@ -409,94 +409,94 @@ ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; SSE-NEXT: movaps %xmm11, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] ; SSE-NEXT: movaps %xmm10, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; SSE-NEXT: movaps %xmm4, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: 
movaps %xmm13, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm15[1] ; SSE-NEXT: movaps 48(%rdx), %xmm15 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movaps 48(%rcx), %xmm12 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm15, %xmm10 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; SSE-NEXT: movaps 48(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, 224(%r8) -; SSE-NEXT: movaps %xmm1, 240(%r8) -; SSE-NEXT: movaps %xmm3, 192(%r8) -; SSE-NEXT: movaps %xmm0, 208(%r8) -; SSE-NEXT: movaps %xmm4, 160(%r8) -; SSE-NEXT: movaps %xmm9, 176(%r8) -; SSE-NEXT: movaps %xmm13, 128(%r8) -; SSE-NEXT: movaps %xmm14, 144(%r8) -; SSE-NEXT: movaps %xmm11, 96(%r8) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm7, 64(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] +; SSE-NEXT: movaps %xmm2, 240(%r8) +; SSE-NEXT: movaps %xmm1, 224(%r8) +; SSE-NEXT: movaps %xmm3, 208(%r8) +; SSE-NEXT: movaps %xmm0, 192(%r8) +; SSE-NEXT: movaps %xmm4, 176(%r8) +; SSE-NEXT: movaps %xmm9, 160(%r8) +; SSE-NEXT: movaps %xmm13, 144(%r8) +; SSE-NEXT: movaps %xmm14, 128(%r8) +; SSE-NEXT: movaps %xmm11, 112(%r8) +; SSE-NEXT: movaps %xmm8, 96(%r8) +; SSE-NEXT: movaps %xmm7, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps %xmm5, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $24, %rsp ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm9[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 
= xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] @@ -504,8 +504,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm5[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] @@ -514,9 +514,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm5[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] @@ -524,34 +524,34 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm15 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm15[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm14[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm11[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm12[2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2] @@ -571,15 +571,15 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: addq $24, %rsp @@ -588,26 +588,26 @@ ; ; AVX2-ONLY-LABEL: store_i32_stride4_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; 
AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] @@ -615,9 +615,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] @@ -627,9 +627,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] @@ -644,14 +644,14 @@ ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -944,131 +944,131 @@ ; AVX1-ONLY-LABEL: 
store_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[1],xmm6[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), 
%xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm12[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm8[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm5[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] @@ -1082,7 +1082,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm1[2],xmm14[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 @@ -1127,16 +1127,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm6[2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 @@ -1146,40 +1146,40 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm6[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm5[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[3,0],xmm11[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1187,82 +1187,82 @@ ; AVX2-ONLY-LABEL: store_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), 
%xmm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = 
xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3],ymm0[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm14 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm15 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] @@ -1270,17 +1270,17 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -1305,24 +1305,24 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX2-ONLY-NEXT: popq %rax ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1337,32 +1337,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: 
vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -1377,14 +1377,14 @@ ; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1398,32 +1398,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: 
vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -1438,14 +1438,14 @@ ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -2725,32 +2725,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512F-NEXT: vmovdqa64 
%zmm22, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512F-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512F-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -2801,22 +2801,22 @@ ; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm21, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2838,32 +2838,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: 
vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -2914,22 +2914,22 @@ ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -28,8 +28,8 @@ ; SSE-NEXT: movaps %xmm0, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] @@ -43,64 +43,64 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovsd 
{{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,0,2,u,5,7,u] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovlps %xmm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2,2,1,4,6,6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[u,u,0,2,u,5,7,u] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovlps %xmm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride5_vf2: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: movq (%r8), %rax ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovq %rax, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <0,2,4,6,u,1,3,5> -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovd %eax, %xmm3 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovq %xmm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-ONLY-NEXT: movq (%r8), %rax +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovq %rax, %xmm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,2,6,u,1,5,3> +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovd %eax, %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd 
%xmm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-ONLY-NEXT: vmovq %xmm1, 32(%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: store_i32_stride5_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u> -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vmovlps %xmm1, 32(%r9) -; AVX512-NEXT: vmovaps %ymm0, (%r9) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,2,4,6,16,1,3,5,7,17,u,u,u,u,u,u> +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 32(%r9) +; AVX512-NEXT: vmovdqa %ymm2, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 @@ -131,25 +131,25 @@ ; SSE-NEXT: movaps %xmm0, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[3,3] ; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[0,2] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[0,1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm3, 32(%r9) -; SSE-NEXT: movaps %xmm8, 48(%r9) +; SSE-NEXT: movaps %xmm3, 48(%r9) +; SSE-NEXT: movaps %xmm8, 32(%r9) ; SSE-NEXT: movaps 
%xmm7, 64(%r9) ; SSE-NEXT: movaps %xmm6, (%r9) ; SSE-NEXT: movaps %xmm0, 16(%r9) @@ -185,8 +185,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -214,7 +214,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r9) @@ -248,10 +248,10 @@ ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vmovaps %xmm0, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -279,7 +279,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r9) @@ -321,138 +321,134 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm11[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0,2] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm0[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm13[0],xmm9[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm11[2,3] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,0] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,1] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm14[0,1] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,0] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm14, (%r9) -; SSE-NEXT: movaps %xmm4, 32(%r9) -; SSE-NEXT: movaps %xmm15, 48(%r9) -; SSE-NEXT: movaps %xmm13, 80(%r9) -; SSE-NEXT: movaps %xmm2, 112(%r9) -; SSE-NEXT: movaps %xmm12, 128(%r9) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm9, 64(%r9) +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm1 +; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps 16(%r8), %xmm4 +; SSE-NEXT: movaps %xmm9, %xmm13 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm6[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm11[0],xmm7[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,0] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm1[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[2,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm9[0],xmm0[1,2,3] +; 
SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm14[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm13[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm6[0] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm6[0,1] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[2,0] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE-NEXT: movaps %xmm1, 128(%r9) +; SSE-NEXT: movaps %xmm13, 112(%r9) +; SSE-NEXT: movdqa %xmm8, 80(%r9) +; SSE-NEXT: movaps %xmm12, 48(%r9) +; SSE-NEXT: movaps %xmm9, 32(%r9) +; SSE-NEXT: movdqa %xmm11, (%r9) +; SSE-NEXT: movaps %xmm10, 144(%r9) ; SSE-NEXT: movaps %xmm5, 96(%r9) -; SSE-NEXT: movaps %xmm8, 144(%r9) +; SSE-NEXT: movaps %xmm7, 64(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm8[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm8[0],xmm6[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,zero,xmm9[2],xmm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1],xmm7[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[1,1],ymm2[1,1],ymm4[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4],ymm9[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, 
%ymm11, %ymm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm8[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5],ymm8[6],ymm6[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1],xmm3[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,3],ymm0[3,3],ymm1[7,7],ymm0[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm4[2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5],ymm6[6],ymm3[7] +; 
AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm4[2],xmm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,1],ymm5[1,1],ymm9[5,5],ymm5[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2],ymm8[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -485,24 +481,24 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,1,1,1,5,5,5,5] +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] @@ -511,8 +507,8 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) ; AVX2-SLOW-NEXT: vzeroupper @@ -541,29 +537,29 @@ ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = <0,1,u,u,3,2,3,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), 
%ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] @@ -572,8 +568,8 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -608,24 +604,24 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: 
vbroadcastsd 24(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] @@ -634,8 +630,8 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -650,19 +646,19 @@ ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <6,14,u,23,31,7,15,u> ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14> +; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3> ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14> ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 -; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r9) -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5,6],ymm1[7] -; AVX512-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512-NEXT: vmovdqa %ymm3, 128(%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64 @@ -683,443 +679,439 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr 
%out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa 32(%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm10 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm14 -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps (%r8), %xmm3 -; SSE-NEXT: movaps 16(%r8), %xmm15 +; SSE-NEXT: subq $136, %rsp +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa 32(%rsi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm13 +; SSE-NEXT: movaps 32(%rcx), %xmm14 +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 
+; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm9 -; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 48(%rcx), %xmm15 +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm15[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,1] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm1[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[2,0] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: movaps (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; 
SSE-NEXT: movaps 48(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm1[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm1[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm1[0],xmm9[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm12[2,0] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3] +; SSE-NEXT: movaps %xmm15, 288(%r9) +; SSE-NEXT: movaps %xmm0, 272(%r9) +; SSE-NEXT: movaps %xmm3, 240(%r9) +; SSE-NEXT: movaps %xmm10, 208(%r9) +; SSE-NEXT: movaps 
%xmm4, 192(%r9) +; SSE-NEXT: movdqa %xmm5, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,0] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm9, 288(%r9) -; SSE-NEXT: movaps %xmm4, 272(%r9) -; SSE-NEXT: movdqa %xmm5, 240(%r9) -; SSE-NEXT: movaps %xmm15, 208(%r9) -; SSE-NEXT: movaps %xmm12, 192(%r9) -; SSE-NEXT: movdqa %xmm6, 160(%r9) -; SSE-NEXT: movaps %xmm14, 128(%r9) -; SSE-NEXT: movaps %xmm10, 112(%r9) -; SSE-NEXT: movdqa %xmm8, 80(%r9) +; SSE-NEXT: movaps %xmm0, 128(%r9) +; SSE-NEXT: movaps %xmm6, 112(%r9) +; SSE-NEXT: movdqa %xmm11, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm2, 256(%r9) +; SSE-NEXT: movaps %xmm7, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm3, 176(%r9) +; SSE-NEXT: movaps %xmm8, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm7, 96(%r9) +; SSE-NEXT: movaps %xmm9, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) -; SSE-NEXT: movaps %xmm13, 16(%r9) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm14[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 
4(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[1,1],xmm8[1,1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm8[0],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm10[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm11[2],xmm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1],xmm10[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = 
ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm12[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm13 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm13[1],ymm6[2,3,4,5],ymm13[6],ymm6[7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm14[2],xmm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,1],xmm15[1,1] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0],ymm13[1,2,3,4],ymm15[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm9[2],xmm8[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm9 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3],xmm9[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm0[3,3],ymm15[7,7],ymm0[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm14[2],xmm13[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1],xmm13[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; 
AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2,3,4,5],ymm8[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,zero,xmm9[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm15[1,1],ymm1[1,1],ymm15[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm9[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm4[2],ymm2[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm1[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride5_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm15 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = 
xmm13[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,3,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[0,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm15 +; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3],ymm14[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm13 -; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm10[0,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1,2],ymm14[3,4],ymm3[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vmovaps (%rdi), 
%ymm15 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm5[1,2],ymm7[3,4],ymm5[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4],ymm15[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm15[3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2],ymm8[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 288(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 256(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 160(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 288(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 256(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 128(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 192(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride5_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm4 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 @@ -1135,7 +1127,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] @@ -1144,88 +1136,88 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm7 = ymm4[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5],ymm7[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm7 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm8 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,3,2,3,2,3,2] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm11 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm1[1,2],ymm13[3,4],ymm1[5,6],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm13 +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5],ymm8[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm13 +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1],ymm13[2],ymm5[3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm12 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm10 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm14 ; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm9, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0],ymm0[1,2,3],ymm5[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3,4],ymm0[5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm14 -; AVX2-FAST-NEXT: 
vunpckhps {{.*#+}} ymm15 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1,2],ymm15[3,4],ymm12[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm15[1,2],ymm6[3,4],ymm15[5,6],ymm6[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[6],ymm12[6],ymm5[7],ymm12[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0],ymm6[1,2,3,4],ymm15[5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm6, 288(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm13, 288(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm8, 256(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm13, 128(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1237,126 +1229,126 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride5_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = 
ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3],ymm14[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = 
ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm10[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1,2],ymm14[3,4],ymm3[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = 
ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm5[1,2],ymm7[3,4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,3,3,4,6,7,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm15[3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2],ymm8[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 288(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 288(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 256(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) ; 
AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1509,82 +1501,82 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $712, %rsp # imm = 0x2C8 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa 32(%rdx), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: movaps 32(%rcx), %xmm6 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm13 -; SSE-NEXT: movaps 32(%r8), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: subq $616, %rsp # imm = 0x268 +; SSE-NEXT: movdqa (%rsi), %xmm14 +; SSE-NEXT: movdqa 16(%rsi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps 32(%rcx), %xmm11 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm6 +; SSE-NEXT: movaps 32(%r8), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm8 +; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; 
SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps 48(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 48(%rcx), %xmm9 +; SSE-NEXT: movaps 48(%r8), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps 64(%rcx), %xmm5 +; SSE-NEXT: movaps 64(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps 80(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1593,256 +1585,236 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps 96(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm15 -; SSE-NEXT: movaps 112(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 112(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm14[1,1] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm8[2,0] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm10[0],xmm15[1,2,3] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movdqa 
%xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; 
SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps 
{{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm3[0],xmm14[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: 
unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm1[2,0] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm7[0],xmm9[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[0,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps 
%xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps $81, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = 
xmm11[1,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: movaps %xmm1, 608(%r9) -; SSE-NEXT: movaps %xmm11, 592(%r9) -; SSE-NEXT: movaps %xmm4, 560(%r9) -; SSE-NEXT: movaps %xmm6, 528(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%r9) -; SSE-NEXT: movaps %xmm8, 480(%r9) +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: movaps %xmm0, 608(%r9) +; SSE-NEXT: movaps %xmm3, 592(%r9) +; SSE-NEXT: movaps %xmm5, 560(%r9) +; SSE-NEXT: movaps %xmm8, 528(%r9) +; SSE-NEXT: movaps %xmm7, 512(%r9) +; SSE-NEXT: movaps %xmm10, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) ; SSE-NEXT: movaps %xmm12, 432(%r9) @@ -1879,158 +1851,160 @@ ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) -; SSE-NEXT: movaps %xmm3, 576(%r9) +; SSE-NEXT: movaps %xmm4, 576(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%r9) -; SSE-NEXT: movaps %xmm5, 496(%r9) +; SSE-NEXT: movaps %xmm9, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) -; SSE-NEXT: movaps %xmm7, 416(%r9) +; SSE-NEXT: movaps 
%xmm11, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps %xmm9, 336(%r9) +; SSE-NEXT: movaps %xmm14, 336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm10, 256(%r9) +; SSE-NEXT: movaps %xmm6, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm2, 176(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm15, 96(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $712, %rsp # imm = 0x2C8 +; SSE-NEXT: addq $616, %rsp # imm = 0x268 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm6[0],zero,zero +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm4[1],xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm3[0],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1],xmm3[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm2[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 
36(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm11[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm0[1],xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1],xmm2[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm8[1,2,3],ymm12[4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 ; 
AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm12[1],xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = xmm12[0],xmm13[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0],ymm7[1,2,3],ymm10[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm9[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm7[2],xmm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1],xmm6[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[1,1],xmm13[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm6[2],xmm5[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm5[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1],ymm15[1,1],ymm7[5,5],ymm15[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm7[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, 
%ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4],ymm9[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1],ymm15[1,1],ymm5[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[3,3],xmm6[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm9[2],ymm7[3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm4[2],xmm3[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4],ymm6[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm7[2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1],xmm2[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[1,1],ymm4[5,5],ymm3[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm3[3,3] @@ -2045,17 +2019,17 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4],ymm4[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm0[2],xmm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm2[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3029,213 +3003,209 @@ ; ; AVX512F-LABEL: store_i32_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm14 -; 
AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm5, %zmm2 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm18, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm8 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm11, %zmm10 ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512F-NEXT: kmovw %eax, %k2 
-; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm23, %zmm25 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 -; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; 
AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm24, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k3} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm13, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm13, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm4, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm17, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm4, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm9 {%k3} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm19, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k2} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm21, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm24 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm23, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm12, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm2 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm18, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm11, %zmm10 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = 
[27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm23, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 -; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm1, 
%zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm17, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm9 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm19, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm23, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -3256,70 +3226,69 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1736, %rsp # imm = 0x6C8 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: subq $1464, %rsp # imm = 0x5B8 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps 32(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm12 +; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps 16(%rcx), %xmm15 +; SSE-NEXT: movaps 32(%rcx), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps 32(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm15[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm12 +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm6 -; SSE-NEXT: movaps 48(%r8), %xmm7 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 48(%rcx), %xmm8 +; SSE-NEXT: movaps 48(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm15 +; SSE-NEXT: movdqa 64(%rdx), %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm13 -; SSE-NEXT: movaps 64(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 64(%rcx), %xmm6 +; SSE-NEXT: movaps 64(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3328,26 +3297,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm14 -; SSE-NEXT: movaps 80(%r8), %xmm2 +; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 80(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps 96(%rcx), %xmm13 +; SSE-NEXT: movaps 96(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3356,12 +3324,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r8), %xmm2 +; SSE-NEXT: movaps 112(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3370,12 +3338,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 128(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r8), %xmm2 +; SSE-NEXT: movaps 128(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3384,12 +3352,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r8), %xmm2 +; SSE-NEXT: movaps 144(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 144(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3398,12 +3366,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 160(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r8), %xmm2 +; SSE-NEXT: movaps 160(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3412,12 
+3380,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r8), %xmm2 +; SSE-NEXT: movaps 176(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 176(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3426,12 +3394,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%r8), %xmm2 +; SSE-NEXT: movaps 192(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3440,12 +3408,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r8), %xmm2 +; SSE-NEXT: movaps 208(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 208(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3454,12 +3422,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r8), %xmm2 +; SSE-NEXT: movaps 224(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3468,167 +3436,176 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm0 +; SSE-NEXT: movaps 240(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r8), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,1] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[2,0] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm9[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
%xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm10[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm1[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm8 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm12[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,1] +; SSE-NEXT: movaps 
%xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm11[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm14[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 
%xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; 
SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -3642,22 +3619,21 @@ ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movaps 128(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -3665,16 +3641,21 @@ ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3688,82 +3669,91 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; 
SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 192(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm14[0],xmm15[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte 
Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3775,185 +3765,128 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] ; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm1[2,0] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm5[0],xmm9[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm2[0,2] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps 240(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = 
xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movaps 240(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; 
SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm3[0],xmm12[1,2,3] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm1[0],xmm7[1,2,3] ; SSE-NEXT: movaps %xmm0, 1248(%r9) -; SSE-NEXT: movaps %xmm3, 1232(%r9) +; SSE-NEXT: movaps %xmm4, 1232(%r9) ; SSE-NEXT: movaps %xmm6, 1200(%r9) -; SSE-NEXT: movaps %xmm7, 1168(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1152(%r9) -; SSE-NEXT: movaps %xmm8, 1120(%r9) +; SSE-NEXT: movaps %xmm8, 1168(%r9) +; SSE-NEXT: movaps %xmm5, 1152(%r9) +; SSE-NEXT: movaps %xmm10, 1120(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1088(%r9) -; SSE-NEXT: movaps %xmm9, 1072(%r9) +; SSE-NEXT: movaps %xmm11, 1072(%r9) ; SSE-NEXT: movaps %xmm13, 1040(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1008(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 992(%r9) +; SSE-NEXT: movaps %xmm14, 992(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 960(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4030,35 +3963,38 @@ ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1264(%r9) -; SSE-NEXT: movaps %xmm5, 1216(%r9) +; SSE-NEXT: movaps %xmm7, 1216(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1184(%r9) -; SSE-NEXT: movaps %xmm4, 1136(%r9) +; SSE-NEXT: movaps %xmm9, 1136(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%r9) ; SSE-NEXT: movaps %xmm12, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1024(%r9) -; SSE-NEXT: movaps %xmm10, 976(%r9) +; SSE-NEXT: movaps %xmm15, 976(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 944(%r9) -; SSE-NEXT: movaps %xmm11, 896(%r9) +; SSE-NEXT: movaps %xmm2, 896(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r9) -; SSE-NEXT: movaps %xmm14, 816(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 816(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r9) -; SSE-NEXT: movaps %xmm2, 736(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 736(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%r9) -; SSE-NEXT: movaps %xmm15, 656(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 656(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 576(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) @@ -4074,7 +4010,7 @@ ; SSE-NEXT: movaps %xmm0, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) @@ -4084,44 +4020,44 @@ ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $1736, %rsp # imm = 0x6C8 +; SSE-NEXT: addq $1464, %rsp # imm = 0x5B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1784, %rsp # imm = 0x6F8 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: subq $1800, %rsp # imm = 0x708 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm6[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = 
zero,xmm15[1],xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm15[0],xmm14[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1],xmm14[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 @@ -4129,33 +4065,33 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm10[1],xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm13[0],zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,1],xmm13[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 ; 
AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm7[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 @@ -4168,13 +4104,13 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm8[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm8[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm4[1,2,3],ymm9[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm4[1,2,3],ymm6[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4182,35 +4118,35 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm4[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1,2,3],ymm0[4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm6[1,2,3],ymm11[4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm11 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm9[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0],xmm0[0],zero,zero +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6],ymm11[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1,2,3],ymm0[4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4218,24 +4154,23 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 228(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 228(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm0[1],xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = 
xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm5[2],xmm7[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] @@ -4243,33 +4178,34 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3],xmm2[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,3],ymm2[3,3],ymm5[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups 
%ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -4284,32 +4220,32 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm14[1,1],ymm15[5,5],ymm14[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,3],ymm2[3,3],ymm5[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -4324,9 +4260,9 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4334,32 +4270,32 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,3],ymm2[3,3],ymm5[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 
= zero,zero,xmm7[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm8[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4381,11 +4317,11 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm5[3,3],ymm2[7,7],ymm5[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 @@ -4400,9 +4336,9 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 132(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm8[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm3[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4467,10 +4403,10 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm12[3,3],ymm2[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm2 @@ -4485,9 +4421,9 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 196(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm9[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = 
zero,zero,xmm6[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4510,12 +4446,12 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm8[3,3],ymm9[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] @@ -4525,10 +4461,9 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 228(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm2[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm12[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4538,9 +4473,9 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm12[1,1],ymm4[5,5],ymm12[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm0 @@ -4554,10 +4489,10 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4],ymm7[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4],ymm6[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm5[0,1],ymm7[2],ymm5[3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] @@ -4565,21 +4500,22 @@ ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm14[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4617,39 +4553,39 @@ ; AVX1-ONLY-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm12[4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4,5,6],ymm15[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = 
ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3],ymm14[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm11[0,1,2],mem[3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2],ymm14[3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm12[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1184(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 1024(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 864(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1024(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 864(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 704(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1248(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4714,7 +4650,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $1784, %rsp # imm = 0x6F8 +; AVX1-ONLY-NEXT: addq $1800, %rsp # imm = 0x708 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6514,429 +6450,419 @@ ; ; AVX512F-LABEL: store_i32_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512F-NEXT: 
vmovdqa64 192(%rsi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm20 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm30 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm15, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm11, %zmm5 ; 
AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm16, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm12, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm25, %zmm16, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm26, %zmm31, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 -; AVX512F-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm12, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2d %zmm23, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm23, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm23, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm28, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm2, %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm31 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, 
%zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} -; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k3} +; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm29 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm19 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm17 +; AVX512F-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm26 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm27 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 
-; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm13 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, 1152(%r9) +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm15 {%k3} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm16 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm1 {%k2} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm28 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm30 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm15, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm11, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm31 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; 
AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm4, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm12, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm25, %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm26, %zmm31, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm21, %zmm14 -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm28, 
%zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm12, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2d %zmm23, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm23, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm23, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm28, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm2, %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm17, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm28, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm25, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm6 
+; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1} -; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 ; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k3} +; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm29 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} +; AVX512BW-NEXT: vmovdqa32 
%zmm8, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm27 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm30 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm17 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm14 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, 
%zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 1152(%r9) +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm15 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm16 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm16 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -45,23 +45,24 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovsd 
{{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm2[2,3],ymm0[4,6],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -70,22 +71,24 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-SLOW-NEXT: vmovaps %xmm1, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; 
AVX2-SLOW-NEXT: retq ; @@ -94,23 +97,24 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,1,3> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <0,2,4,6,u,u,1,3> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovaps %xmm3, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %xmm2, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -119,43 +123,44 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] 
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: store_i32_stride6_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u> -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) -; AVX512-NEXT: vmovaps %ymm0, (%rax) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,16,20,1,3,5,7,17,21,u,u,u,u> +; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 @@ -180,35 +185,35 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps (%r8), %xmm7 -; SSE-NEXT: movaps (%r9), %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3] -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm4 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm7[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: 
shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm3, 16(%rax) -; SSE-NEXT: movaps %xmm2, 32(%rax) +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm2, 80(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: movaps %xmm8, 16(%rax) +; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: retq ; @@ -229,7 +234,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] @@ -335,164 +340,161 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride6_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm8 -; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps 16(%r9), %xmm3 -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[3,3],xmm3[3,3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm7[3,3] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm14[0,2] -; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[0,2] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm3, 112(%rax) -; SSE-NEXT: movaps %xmm13, 160(%rax) -; SSE-NEXT: movaps %xmm2, 176(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm11 +; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps (%rdx), %xmm1 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm12 +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm13 +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movaps (%r9), %xmm14 +; SSE-NEXT: movaps 16(%r9), %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm14, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm13[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[2,0] +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; 
SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm9[0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm12[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm9[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm8[2,0] +; SSE-NEXT: movaps %xmm5, 176(%rax) +; SSE-NEXT: movaps %xmm10, 160(%rax) +; SSE-NEXT: movaps %xmm2, 144(%rax) +; SSE-NEXT: movaps %xmm13, 128(%rax) +; SSE-NEXT: movaps %xmm14, 112(%rax) +; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) ; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm8, 64(%rax) -; SSE-NEXT: movaps %xmm11, 80(%rax) -; SSE-NEXT: movaps %xmm10, 128(%rax) -; SSE-NEXT: movaps %xmm12, 144(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,2],xmm5[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm7, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm10[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,2],ymm2[1,2],ymm1[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[3,0],ymm2[7,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: 
store_i32_stride6_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] @@ -517,49 +519,49 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm12, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm13, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4],ymm12[5],ymm14[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} 
ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 160(%rax) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -568,93 +570,93 @@ ; AVX2-FAST-LABEL: store_i32_stride6_vf8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: 
vmovdqa (%rdx), %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm12 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm13 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm13 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} 
ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,6,0,7,0,6,0,7] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm10, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX2-FAST-NEXT: vpbroadcastd %xmm11, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4],ymm13[5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,7,0,6,0,7,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,7,0,6,0,7] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] @@ -679,49 +681,49 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm13, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4],ymm12[5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = 
ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -783,9 +785,9 @@ ; SSE-NEXT: movaps (%rcx), %xmm1 ; SSE-NEXT: movaps 16(%rcx), %xmm0 ; SSE-NEXT: movaps (%r8), %xmm3 -; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps 16(%r8), %xmm13 ; SSE-NEXT: movaps (%r9), %xmm4 -; SSE-NEXT: movaps 16(%r9), %xmm13 +; SSE-NEXT: movaps 16(%r9), %xmm14 ; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -796,52 +798,52 @@ ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm11[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm13[1,1] -; 
SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; SSE-NEXT: movaps %xmm14, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm13[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm13 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm14[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm14 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] @@ -854,53 +856,53 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm15[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm12[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm3[2,0] +; SSE-NEXT: movaps 48(%rdx), %xmm0 ; SSE-NEXT: movaps 48(%rcx), %xmm10 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: movaps 48(%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: movaps 48(%r8), %xmm1 -; SSE-NEXT: movaps 48(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] -; SSE-NEXT: shufps 
{{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: movaps 48(%r8), %xmm7 +; SSE-NEXT: movaps 48(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 368(%rax) +; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps %xmm9, 352(%rax) -; SSE-NEXT: movaps %xmm2, 336(%rax) -; SSE-NEXT: movaps %xmm5, 320(%rax) +; SSE-NEXT: movaps %xmm1, 336(%rax) +; SSE-NEXT: movaps %xmm3, 320(%rax) ; SSE-NEXT: movaps %xmm6, 304(%rax) -; SSE-NEXT: movaps %xmm4, 288(%rax) +; SSE-NEXT: movaps %xmm2, 288(%rax) ; SSE-NEXT: movaps %xmm13, 272(%rax) ; SSE-NEXT: movaps %xmm8, 256(%rax) ; SSE-NEXT: movaps %xmm12, 240(%rax) @@ -936,347 +938,333 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: 
vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[4],ymm13[4],ymm5[5],ymm13[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 36(%r8), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 36(%r8), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps 
{{.*#+}} ymm0 = ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[6],ymm15[6],ymm10[7],ymm15[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,2],ymm2[1,2],ymm1[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm6[3,0],ymm8[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[3,0],ymm2[7,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $104, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-SLOW-NEXT: 
vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm4 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm5, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, 
%xmm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm6 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm7, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm10, %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: 
vpunpckldq {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[4],ymm1[4],ymm15[5],ymm1[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[6],ymm1[6],ymm15[7],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 256(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) @@ -1286,354 +1274,352 @@ ; ; AVX2-FAST-LABEL: store_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: subq $200, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: 
vmovdqa (%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vpbroadcastd (%rcx), %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 +; AVX2-FAST-NEXT: vpbroadcastd %xmm6, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm12 = 
mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-NEXT: vpunpckhdq 
{{.*#+}} ymm8 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm7, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-FAST-NEXT: vpbroadcastd %xmm7, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm9[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = 
ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[4],ymm3[4],ymm13[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd 32(%r9), %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 352(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa 
%ymm6, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $232, %rsp +; AVX2-FAST-NEXT: addq $200, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,1,2,3] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: 
vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm12[0],ymm5[1],ymm12[2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[4],ymm1[4],ymm15[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = 
ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[6],ymm1[6],ymm15[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) @@ -1663,15 +1649,18 @@ ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: movb $-110, %cl +; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> ; AVX512F-SLOW-NEXT: vpermi2d 
%zmm1, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1679,23 +1668,20 @@ ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512F-SLOW-NEXT: movb $-110, %cl -; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1716,10 +1702,10 @@ ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper @@ -1734,70 +1720,74 @@ ; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; 
AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: movb $-110, %cl -; AVX512F-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 ; AVX512F-FAST-NEXT: movb $36, %cl ; AVX512F-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm9 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512F-FAST-NEXT: movb $-110, %cl +; AVX512F-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] @@ -1805,9 +1795,9 @@ ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 
320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1833,15 +1823,18 @@ ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: movb $-110, %cl +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1849,23 +1842,20 @@ ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-SLOW-NEXT: movb $-110, %cl -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512BW-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1886,10 +1876,10 @@ ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -1904,70 +1894,74 @@ ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FAST-NEXT: movb $-110, %cl -; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 ; AVX512BW-FAST-NEXT: movb $36, %cl ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 -; 
AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm9, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512BW-FAST-NEXT: movb $-110, %cl +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = 
[12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] @@ -1975,9 +1969,9 @@ ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64 @@ -2023,22 +2017,22 @@ ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = 
xmm6[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm12, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] @@ -2050,25 +2044,25 @@ ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm6 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] @@ -2083,22 +2077,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} 
xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 @@ -2116,22 +2110,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm6 ; SSE-NEXT: movaps 64(%rcx), %xmm0 @@ -2149,22 +2143,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm6 ; SSE-NEXT: movaps 80(%rcx), %xmm0 @@ -2182,22 +2176,22 @@ ; 
SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm9 ; SSE-NEXT: movaps 96(%rcx), %xmm0 @@ -2213,53 +2207,53 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] +; SSE-NEXT: movaps 112(%rdx), %xmm0 ; SSE-NEXT: movaps 112(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE-NEXT: movaps 112(%rdi), %xmm1 ; SSE-NEXT: movaps 112(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] -; SSE-NEXT: movaps 112(%r8), %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} 
xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movaps 112(%r8), %xmm7 +; SSE-NEXT: movaps 112(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 752(%rax) +; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps %xmm10, 736(%rax) -; SSE-NEXT: movaps %xmm2, 720(%rax) -; SSE-NEXT: movaps %xmm5, 704(%rax) +; SSE-NEXT: movaps %xmm1, 720(%rax) +; SSE-NEXT: movaps %xmm3, 704(%rax) ; SSE-NEXT: movaps %xmm6, 688(%rax) -; SSE-NEXT: movaps %xmm4, 672(%rax) +; SSE-NEXT: movaps %xmm2, 672(%rax) ; SSE-NEXT: movaps %xmm9, 656(%rax) ; SSE-NEXT: movaps %xmm8, 640(%rax) ; SSE-NEXT: movaps %xmm11, 624(%rax) @@ -2343,58 +2337,50 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: subq $744, %rsp # imm = 0x2E8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 
32(%rdx), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[4],ymm8[4],ymm12[5],ymm8[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -2403,16 +2389,9 @@ ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2429,11 +2408,8 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -2442,15 +2418,16 @@ ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[4],ymm14[4],ymm6[5],ymm14[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2460,96 +2437,77 @@ ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,2],xmm9[1,2] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 100(%r8), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[4],ymm3[4],ymm7[5],ymm3[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 100(%r8), %xmm1 +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm12[1,2],ymm0[5,6],ymm12[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,2],ymm13[1,2],ymm8[5,6],ymm13[5,6] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm9[1,2],ymm11[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] 
+; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[6],ymm14[6],ymm6[7],ymm14[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,2],ymm14[1,2],ymm15[5,6],ymm14[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm6 +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,0,0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,2],ymm4[1,2],ymm3[5,6],ymm4[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[6],ymm3[6],ymm7[7],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2],ymm2[1,2],ymm10[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = 
ymm7[1,2],ymm10[1,2],ymm7[5,6],ymm10[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm1 @@ -2558,154 +2516,160 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm11[3,0],ymm9[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; 
AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm15[3,0],ymm14[7,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss 96(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],mem[0],xmm6[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm7[0,1],ymm5[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[3,0],ymm14[3,0],ymm5[7,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0],ymm9[1],ymm7[2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) @@ -2727,7 +2691,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2749,7 +2713,7 @@ ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] @@ -2889,7 +2853,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2909,8 +2873,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rcx), %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm14[0],xmm7[1],xmm14[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] @@ -2941,7 +2905,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] @@ -3018,7 +2982,7 @@ ; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm10 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] @@ -3120,338 +3084,344 @@ ; ; AVX2-FAST-LABEL: store_i32_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $872, %rsp # imm = 0x368 +; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: 
vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm14 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm8 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = 
xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm10 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm5 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm15, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm8 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpckldq 
{{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm13, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm4 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm9 -; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 96(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 96(%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm13, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[4],ymm10[4],ymm15[5],ymm10[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 
= ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[6],ymm2[6],ymm7[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm15, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vpermd 32(%r9), %ymm15, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm6 +; AVX2-FAST-NEXT: vpermd 64(%r9), %ymm15, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = 
ymm9[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd 96(%r9), %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 736(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 672(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 544(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 448(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 448(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 352(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm13, 288(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3480,7 +3450,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $872, %rsp # imm = 0x368 +; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3502,7 +3472,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] @@ -3642,7 +3612,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3662,8 +3632,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] @@ -3694,7 +3664,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] @@ -3771,7 +3741,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] @@ -3874,550 +3844,558 @@ ; AVX512F-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = 
[4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 -; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 -; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm16 = zmm15[2],zmm12[2],zmm15[3],zmm12[3],zmm15[6],zmm12[6],zmm15[7],zmm12[7],zmm15[10],zmm12[10],zmm15[11],zmm12[11],zmm15[14],zmm12[14],zmm15[15],zmm12[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm1, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [3,11,0,8,7,15,4,12] +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm19, %ymm17 ; AVX512F-SLOW-NEXT: movb $36, %dl ; AVX512F-SLOW-NEXT: kmovw %edx, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512F-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm17[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm19, %ymm18 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm18[0,1,0,1,2,3,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm17, %zmm1 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm1 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm11 +; AVX512F-SLOW-NEXT: movb $-110, %cl +; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm11 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512F-SLOW-NEXT: movb $-110, %cl -; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm18, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm24, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; 
AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 -; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm12, %zmm27, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm12, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm14 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm19 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm20, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm17 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm22 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm18 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm23, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm24, %zmm18 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm25 +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm27 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm12, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 
{{.*#+}} zmm20 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm19 +; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm12 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm20, %zmm12 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm12 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm20, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm16[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm3[2],zmm4[2],zmm3[3],zmm4[3],zmm3[6],zmm4[6],zmm3[7],zmm4[7],zmm3[10],zmm4[10],zmm3[11],zmm4[11],zmm3[14],zmm4[14],zmm3[15],zmm4[15] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; 
AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d 
%zmm14, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %ymm16 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm18, %zmm10 +; AVX512F-FAST-NEXT: movb $36, %cl +; AVX512F-FAST-NEXT: kmovw %ecx, %k1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm20, %zmm1 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 -; AVX512F-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %ymm22 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512F-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm25, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm23, %zmm9 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm19, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm20, %zmm2 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm25, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm21, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm10 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; 
AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm22, %zmm18 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm16 +; AVX512F-FAST-NEXT: movb $-110, %cl +; AVX512F-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm23, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm16 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm25, %zmm20 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 -; AVX512F-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 -; AVX512F-FAST-NEXT: movb $-110, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512F-FAST-NEXT: movb $36, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 
%zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm26, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm27, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm28, %zmm29 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm21, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm29, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm30, %zmm20 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm31, %zmm11 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm14, 
%zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm23, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm24, %zmm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm25 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm26, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm27, %zmm19 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm28 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm30, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm31, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm13, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm12, %zmm11 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm14, %zmm13 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; 
AVX512BW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm16, %zmm15 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm12 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm14 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm18, %zmm17, %zmm16 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm17[2],zmm18[2],zmm17[3],zmm18[3],zmm17[6],zmm18[6],zmm17[7],zmm18[7],zmm17[10],zmm18[10],zmm17[11],zmm18[11],zmm17[14],zmm18[14],zmm17[15],zmm18[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm18, %zmm2, %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm16 = zmm15[2],zmm12[2],zmm15[3],zmm12[3],zmm15[6],zmm12[6],zmm15[7],zmm12[7],zmm15[10],zmm12[10],zmm15[11],zmm12[11],zmm15[14],zmm12[14],zmm15[15],zmm12[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm1, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [3,11,0,8,7,15,4,12] +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm19, %ymm17 ; AVX512BW-SLOW-NEXT: movb $36, %dl ; AVX512BW-SLOW-NEXT: kmovd %edx, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm5, %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm20[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm17[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} 
zmm17 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm19, %ymm18 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm18[0,1,0,1,2,3,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm17, %zmm1 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm1 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm17, %zmm11 +; AVX512BW-SLOW-NEXT: movb $-110, %cl +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm11 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm11 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 -; AVX512BW-SLOW-NEXT: movb $-110, %cl -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm18, %zmm13 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm13 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm24, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k1} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm20, %zmm12 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm21, %zmm12 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm23, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm24, %zmm14 -; AVX512BW-SLOW-NEXT: 
vpermi2d %zmm6, %zmm1, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm26, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm27, %zmm16 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm1, %zmm18 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm12, %zmm27, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm12, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm14 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm19 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm20, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm17 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm22 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm18 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm23, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm24, %zmm18 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm25 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm27 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm12, %zmm27 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm27 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm12, %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm20[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm19 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm2, %zmm12 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm21[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: 
vpermt2d %zmm3, %zmm22, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm23, %zmm18 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm21, %zmm1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm5, %zmm1 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm7, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm21[2,3,2,3,2,3,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm20, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm12 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm20, %zmm0 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm16[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm10, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm9, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm20, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm3[2],zmm4[2],zmm3[3],zmm4[3],zmm3[6],zmm4[6],zmm3[7],zmm4[7],zmm3[10],zmm4[10],zmm3[11],zmm4[11],zmm3[14],zmm4[14],zmm3[15],zmm4[15] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm3[6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm10, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm9, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; 
AVX512BW-FAST-LABEL: store_i32_stride6_vf32: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm14 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm2 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm12, %zmm20 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %ymm16 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm18, %zmm10 +; AVX512BW-FAST-NEXT: movb $36, %cl +; 
AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm20, %zmm1 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm10, %zmm9 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25 -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] -; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm17 -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm21 -; AVX512BW-FAST-NEXT: vpermi2d %zmm18, %zmm4, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm18, %zmm26, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm24, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %ymm22 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] +; AVX512BW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm25, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm23, %zmm9 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm19, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm20, %zmm2 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm25, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm21, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm10 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm22, %zmm18 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm17 = 
mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm16 +; AVX512BW-FAST-NEXT: movb $-110, %cl +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm23, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm16 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm25, %zmm20 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm18 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm12 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm10 -; AVX512BW-FAST-NEXT: vpermi2d %zmm11, %zmm1, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm1 -; AVX512BW-FAST-NEXT: movb $-110, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512BW-FAST-NEXT: movb $36, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm20, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm22, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm7, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm17, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm11, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm1 -; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm26, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm27, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm28, %zmm29 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm21, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm29, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm30, %zmm20 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm31, %zmm11 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm13, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm23, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm24, %zmm17 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm25 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm26, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm27, %zmm19 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm28 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm30, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm31, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm11, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm13, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -4447,14 +4425,14 @@ ; SSE-NEXT: movaps 16(%rsi), %xmm0 ; SSE-NEXT: movaps (%rdx), %xmm11 ; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps (%rcx), %xmm4 +; SSE-NEXT: movaps (%rcx), %xmm5 ; SSE-NEXT: movaps 16(%rcx), %xmm1 ; SSE-NEXT: movaps (%r8), %xmm6 ; SSE-NEXT: movaps 16(%r8), %xmm3 ; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps 16(%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm4 ; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] ; SSE-NEXT: movaps %xmm9, %xmm8 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] ; SSE-NEXT: movaps %xmm7, %xmm14 @@ -4463,57 +4441,57 @@ ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: 
shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps %xmm4, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm5[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm5[3,3] -; SSE-NEXT: movaps 32(%rdx), %xmm5 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm6 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE-NEXT: movaps 32(%r8), %xmm2 ; SSE-NEXT: movaps 32(%r9), %xmm3 @@ -4523,23 +4501,23 @@ ; 
SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm6 ; SSE-NEXT: movaps 48(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm5 @@ -4556,22 +4534,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm6 ; 
SSE-NEXT: movaps 64(%rcx), %xmm0 @@ -4589,22 +4567,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm6 ; SSE-NEXT: movaps 80(%rcx), %xmm0 @@ -4622,22 +4600,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm6 ; SSE-NEXT: movaps 96(%rcx), %xmm0 @@ -4655,22 +4633,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps 
{{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm6 ; SSE-NEXT: movaps 112(%rcx), %xmm0 @@ -4688,22 +4666,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm6 ; SSE-NEXT: movaps 128(%rcx), %xmm0 @@ -4721,22 +4699,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm6 ; SSE-NEXT: movaps 144(%rcx), %xmm0 @@ -4754,22 +4732,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm6 ; SSE-NEXT: movaps 160(%rcx), %xmm0 @@ -4787,22 +4765,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm6 ; SSE-NEXT: movaps 176(%rcx), %xmm0 @@ -4820,22 +4798,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdx), %xmm6 ; SSE-NEXT: movaps 192(%rcx), %xmm0 @@ -4853,22 +4831,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdx), %xmm6 ; SSE-NEXT: movaps 208(%rcx), %xmm0 @@ -4886,22 +4864,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm7[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdx), %xmm9 ; SSE-NEXT: movaps 224(%rcx), %xmm0 @@ -4917,53 +4895,53 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm4[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[2,3] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] +; SSE-NEXT: movaps 240(%rdx), %xmm0 ; SSE-NEXT: movaps 240(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpcklps 
{{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps 240(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps 240(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] -; SSE-NEXT: movaps 240(%r8), %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movaps 240(%r8), %xmm7 +; SSE-NEXT: movaps 240(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm4[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm5[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1520(%rax) +; SSE-NEXT: movaps %xmm0, 1520(%rax) ; SSE-NEXT: movaps %xmm10, 1504(%rax) -; SSE-NEXT: movaps %xmm2, 1488(%rax) -; SSE-NEXT: movaps %xmm5, 1472(%rax) +; SSE-NEXT: movaps %xmm1, 1488(%rax) +; SSE-NEXT: movaps %xmm3, 1472(%rax) ; SSE-NEXT: movaps %xmm6, 1456(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) +; SSE-NEXT: movaps %xmm2, 1440(%rax) ; SSE-NEXT: movaps %xmm9, 1424(%rax) ; SSE-NEXT: movaps %xmm8, 1408(%rax) ; SSE-NEXT: movaps %xmm11, 1392(%rax) @@ -5143,60 +5121,55 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2504, %rsp # imm = 0x9C8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: subq $1864, %rsp # imm = 0x748 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 ; 
AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5205,17 +5178,9 @@ ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5225,18 +5190,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5245,17 +5204,16 @@ ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5265,18 +5223,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5285,17 +5237,16 @@ ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5305,18 +5256,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5327,8 +5272,8 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = 
ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5344,16 +5289,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 144(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5362,17 +5303,16 @@ ; AVX1-ONLY-NEXT: vbroadcastss 164(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5382,16 +5322,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 176(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5400,17 +5336,15 @@ ; AVX1-ONLY-NEXT: vbroadcastss 196(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[4],ymm3[4],ymm12[5],ymm3[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5420,18 +5354,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 208(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5440,168 +5368,124 @@ ; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,2],ymm15[1,2],ymm10[5,6],ymm15[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,2],mem[1,2],ymm8[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm15[3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm14[1,2],mem[1,2],ymm14[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm8[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] -; 
AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm13[1,2],mem[1,2],ymm13[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,2],ymm15[1,2],ymm13[5,6],ymm15[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: 
vunpckhps {{.*#+}} ymm13 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm15[1,2],ymm11[5,6],ymm15[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,2],ymm14[1,2],ymm11[5,6],ymm14[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm13 +; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm8[1,2],ymm6[5,6],ymm8[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[1,2],ymm6[5,6],ymm13[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm11 +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,2],ymm2[1,2],ymm4[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,2],mem[1,2],ymm5[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[6],ymm3[6],ymm12[7],ymm3[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,2],ymm10[1,2],ymm9[5,6],ymm10[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[6],ymm0[6],ymm7[7],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,2],ymm1[1,2],ymm5[5,6],ymm1[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm2[1,2],ymm1[5,6],ymm2[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r9), %ymm1 @@ -5610,9 +5494,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 @@ -5620,9 +5503,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5635,9 +5517,10 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[3,0],ymm0[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5649,9 +5532,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 @@ -5659,9 +5541,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5677,8 +5558,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5690,9 +5570,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 @@ -5700,194 +5579,229 @@ ; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $51, (%rsp), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss 96(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # 
ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 
32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm15[3,0],mem[3,0],ymm15[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = 
xmm13[2],mem[2],xmm13[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte 
Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vbroadcastss 160(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss 160(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0],ymm9[1],ymm7[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vbroadcastss 192(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vbroadcastss 192(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = 
xmm10[0],mem[0],xmm10[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 224(%r9), %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, 1504(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 1504(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 1408(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 1312(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 1152(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 1120(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 1024(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 928(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 832(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 928(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 640(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) +; AVX1-ONLY-NEXT: vmovups 
(%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5909,18 +5823,12 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) @@ -5958,7 +5866,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX1-ONLY-NEXT: addq $1864, %rsp # imm = 0x748 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6046,7 +5954,7 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 100(%r9), %ymm9 @@ -6132,7 +6040,7 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa 224(%r8), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 228(%r9), %ymm14 @@ -6237,13 +6145,13 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rcx), %xmm14 +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] -; 
AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq (%rsp), %ymm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6299,8 +6207,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 148(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 160(%rcx), %xmm14 +; AVX2-SLOW-NEXT: vpbroadcastd 160(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] @@ -6330,8 +6238,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 180(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 192(%rcx), %xmm14 +; AVX2-SLOW-NEXT: vpbroadcastd 192(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] @@ -6369,7 +6277,7 @@ ; AVX2-SLOW-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6519,7 +6427,7 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] @@ -6527,7 +6435,7 @@ ; AVX2-SLOW-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] @@ -6661,7 +6569,7 @@ ; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] @@ -6708,7 +6616,7 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 672(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) @@ -6782,392 +6690,408 @@ ; ; AVX2-FAST-LABEL: store_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2376, %rsp # imm = 0x948 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FAST-NEXT: subq $2840, %rsp # imm = 0xB18 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm4 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm8 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq 
{{.*#+}} xmm2 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm5 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm4 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm6 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm3 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm3 ; 
AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm7, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; 
AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 96(%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd 96(%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 128(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm4 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 128(%rcx), %xmm6 +; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm4, 
%ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastss 160(%rcx), %xmm0 +; AVX2-FAST-NEXT: vbroadcastss 160(%rdx), %xmm6 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm1, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpbroadcastd %xmm11, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 192(%rcx), %xmm0 +; AVX2-FAST-NEXT: vbroadcastss 192(%rdx), %xmm4 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 192(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm1, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero @@ -7183,9 +7107,11 @@ ; AVX2-FAST-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%r9), %ymm4 +; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm1, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm1 @@ -7197,10 +7123,10 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 -; AVX2-FAST-NEXT: 
vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] @@ -7213,279 +7139,269 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm14[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3],ymm4[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm14[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm3, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd 32(%r9), %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd 64(%r9), %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, 
%ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd 96(%r9), %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd 128(%r9), %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte 
Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3],ymm15[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[4],ymm9[4],ymm11[5],ymm9[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[2,3],ymm9[2,3] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd 160(%r9), %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] -; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[2,3],ymm8[2,3] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd 192(%r9), %ymm3, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[4],mem[4],ymm6[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd 224(%r9), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, 1504(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 1440(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 1408(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 1312(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 1248(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 1216(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 1120(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 1056(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 1504(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 1440(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 1408(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 1312(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 1248(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 1216(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 1120(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 1056(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 1024(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 928(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 864(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 928(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 864(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7520,7 +7436,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 960(%rax) @@ -7560,7 +7476,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $2376, %rsp # imm = 0x948 +; AVX2-FAST-NEXT: addq $2840, %rsp # imm = 0xB18 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -7648,7 +7564,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%r9), %ymm9 @@ -7734,7 +7650,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 228(%r9), %ymm14 @@ -7839,13 +7755,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%rsp), %ymm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7901,8 +7817,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 148(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 160(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 160(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] @@ -7932,8 +7848,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 180(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 192(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 192(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = 
ymm12[0,1,2,1] @@ -7971,7 +7887,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8121,7 +8037,7 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] @@ -8129,7 +8045,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] @@ -8263,7 +8179,7 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] @@ -8310,7 +8226,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 672(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) @@ -8649,305 +8565,323 @@ ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 
128(%rsi), %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-FAST-NEXT: subq $1352, %rsp # imm = 0x548 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm23 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm12, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 128(%rcx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 128(%rsi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; 
AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512F-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm20 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm5 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm18 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa 192(%rcx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdx), %ymm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; 
AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm2, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm1, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm7, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm6, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm6, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm12 +; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm7 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 +; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm6 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; 
AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm22, %zmm18 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm25 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm21, %zmm26 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm31 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 
-; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: movb $-110, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rcx), %zmm3 +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm22 +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm16 +; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm10, %zmm6 ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; 
AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm4, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm4, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm22 +; AVX512F-FAST-NEXT: movb $-110, %al +; AVX512F-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm25 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm26, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm27, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm24, %zmm23 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512F-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm27, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm24, %zmm31 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm26, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm27, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm24, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm26, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm27, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm28 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm24, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm1, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm9, %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm31 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm9, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm3, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm4, %zmm7 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm9, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm3, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 
640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-FAST-NEXT: addq $1160, %rsp # imm = 0x488 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 1472(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 1088(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 896(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-FAST-NEXT: addq $1352, %rsp # imm = 0x548 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -9218,305 +9152,323 @@ ; ; AVX512BW-FAST-LABEL: store_i32_stride6_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: subq $1160, %rsp # imm = 0x488 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm23 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> -; AVX512BW-FAST-NEXT: vbroadcasti64x4 
{{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: subq $1352, %rsp # imm = 0x548 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm23 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa 128(%rcx), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdx), %ymm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm0, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa 128(%rsi), %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm20 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm18 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] -; AVX512BW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm10, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm8, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm11, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm1, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm12, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa 192(%rcx), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdx), %ymm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm2, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm1, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm7, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: 
vmovdqa64 %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm6, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm9, %zmm19 -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm2, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm7, %zmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm6, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm12 +; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm11 +; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm11, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm11 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm23, %zmm2, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm23, %zmm30, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm29, %zmm23 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm4, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm8, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; 
AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm22, %zmm18 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm25 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm27 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [3,19,0,16,11,27,8,24,15,31,12,28,15,31,12,28] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm21, %zmm26 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm10, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm31 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm30 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm29, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm20 -; AVX512BW-FAST-NEXT: 
vpermi2d %zmm0, %zmm1, %zmm24 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm29 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: movb $-110, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm3 +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm22 +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm17 +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm16 +; AVX512BW-FAST-NEXT: vpermi2d %zmm3, %zmm6, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm10, %zmm6 ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm23 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; 
AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm5, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm5, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm5, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm13 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm5, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm4, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm4, %zmm22 +; AVX512BW-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm4, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm22 +; AVX512BW-FAST-NEXT: movb $-110, %al +; AVX512BW-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm25 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm20 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm4, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm26, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm27, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm24, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm28 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm29 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 
# 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm17 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm3 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm4, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm11, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm14 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm4, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm27, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm24, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 
%zmm0, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm26, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm27, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm24, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm26, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm27, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm24, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm1, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm9, %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm31 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm9, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm3, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm4, %zmm7 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm9, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm3, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm4, %zmm6 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 1472(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 1408(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1344(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 1216(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 1152(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 1088(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 1024(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, 960(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 896(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 832(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 768(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 
%zmm23, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-FAST-NEXT: addq $1160, %rsp # imm = 0x488 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 1472(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 1408(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 1088(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 896(%rax) +; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-FAST-NEXT: addq $1352, %rsp # imm = 0x548 ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -27,23 +27,23 @@ ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm0[1,0] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm7[2,0] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm6, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq %xmm2, 
48(%rax) +; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf2: @@ -52,138 +52,76 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm0[2,1],ymm6[4,6],ymm0[6,5] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,3],ymm1[4,6],ymm0[4,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,0,2,u,u,u,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vbroadcastsd (%r10), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[1],xmm2[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-ONLY-NEXT: vmovlps %xmm1, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX1-ONLY-NEXT: vmovlps %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: store_i32_stride7_vf2: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), 
%rax -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovlps %xmm0, 48(%rax) -; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: store_i32_stride7_vf2: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <3,5,7,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-FAST-NEXT: vmovlps %xmm2, 48(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovaps %xmm1, 32(%rax) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf2: -; AVX2-FAST-PERLANE: # %bb.0: -; 
AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovlps %xmm0, 48(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: store_i32_stride7_vf2: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-ONLY-NEXT: movq (%r10), %rcx +; AVX2-ONLY-NEXT: vmovq %rcx, %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,0,1,0,4,0,1] +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-ONLY-NEXT: vmovd %ecx, %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm8, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpunpckldq 
{{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vmovq %xmm2, 48(%rax) +; AVX2-ONLY-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512-SLOW-LABEL: store_i32_stride7_vf2: ; AVX512-SLOW: # %bb.0: @@ -197,16 +135,16 @@ ; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u> -; AVX512-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX512-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm1 +; AVX512-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u> +; AVX512-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512-SLOW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax) +; AVX512-SLOW-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512-SLOW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512-SLOW-NEXT: vzeroupper ; AVX512-SLOW-NEXT: retq ; @@ -222,12 +160,12 @@ ; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u> ; AVX512-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) @@ -262,48 +200,48 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm4 ; SSE-NEXT: movaps (%rdx), %xmm5 ; SSE-NEXT: movaps (%rcx), %xmm1 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps (%r9), %xmm2 +; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps (%r9), %xmm3 ; SSE-NEXT: movaps (%r10), %xmm8 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = 
xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm1[1,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm9[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm1[1,1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,0],xmm3[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm10[2,0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm10[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm9[2,0] -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1] -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm10[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm5[2,0] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm4, 16(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm2, 48(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[2,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] ; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm7, 64(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm3, 48(%rax) +; SSE-NEXT: movaps %xmm8, 32(%rax) +; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: retq ; @@ -311,48 +249,48 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; 
AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm8[1,0],ymm9[5,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm8[2,1],ymm10[6,4],ymm8[6,5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,1],ymm6[2,0],ymm5[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1],xmm4[1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3],ymm7[3,3],ymm5[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm11[2,0],ymm9[6,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm5[2,0],ymm4[5,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = zero,xmm3[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] ; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,0],xmm3[0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,3],ymm7[3,3],ymm4[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,1],ymm4[2,0],ymm9[6,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; AVX1-ONLY-NEXT: vbroadcastss 12(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -401,11 +339,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vmovaps %xmm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -416,14 +354,14 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-NEXT: vmovaps (%r10), %xmm0 +; AVX2-FAST-NEXT: vmovaps (%r10), %xmm1 ; AVX2-FAST-NEXT: vinsertf128 $1, (%rsi), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,1,5,1,5,1,5,1] @@ -439,7 +377,7 @@ ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] @@ -457,12 +395,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX2-FAST-NEXT: 
vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovaps %xmm0, 96(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -512,11 +450,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -528,17 +466,17 @@ ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u> -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -565,229 +503,225 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm6 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: movaps 16(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm12[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movdqa 16(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm1 +; SSE-NEXT: movaps 16(%rcx), %xmm14 +; SSE-NEXT: movaps (%r8), %xmm11 +; SSE-NEXT: movaps 16(%r8), %xmm12 +; SSE-NEXT: movdqa (%r9), %xmm4 +; SSE-NEXT: movdqa 16(%r9), %xmm10 +; SSE-NEXT: movdqa (%rax), %xmm9 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: pshufd 
{{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm5[0],xmm13[1,2,3] -; SSE-NEXT: movaps (%rcx), %xmm0 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm6[0] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm14[3,3] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm7[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[2,0] -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: movaps %xmm2, %xmm14 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm3[1,3] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm11[0] -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0,1],mem[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm10[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm13[1,0] +; SSE-NEXT: movaps %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[0,2] +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,3] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm12[0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm4[0],xmm13[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm4[0],xmm11[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, 112(%rax) -; SSE-NEXT: movdqa %xmm5, 176(%rax) +; SSE-NEXT: movaps %xmm3, 192(%rax) +; SSE-NEXT: movdqa %xmm10, 176(%rax) +; SSE-NEXT: movaps %xmm14, 128(%rax) +; SSE-NEXT: movaps %xmm2, 112(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movdqa %xmm8, (%rax) -; SSE-NEXT: movaps %xmm3, 16(%rax) -; SSE-NEXT: movaps %xmm14, 64(%rax) -; SSE-NEXT: movaps %xmm6, 128(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps %xmm13, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm4, 96(%rax) +; SSE-NEXT: movaps %xmm7, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm1, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm11, 
96(%rax) +; SSE-NEXT: movaps %xmm13, 80(%rax) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1],ymm3[1,1],ymm2[5,5],ymm3[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm7[2,1],ymm5[6,4],ymm7[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[1,1],xmm5[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[1,1],xmm9[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0],xmm10[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm14[1],xmm15[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm13[0],ymm4[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[0,2],ymm8[5,5],ymm9[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,1],ymm3[1,1],ymm2[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1],ymm4[1,1],ymm1[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6],ymm15[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,1],ymm15[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[1,1],xmm12[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm14[1],xmm13[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[1,1],ymm14[0,2],ymm9[5,5],ymm14[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[3,3],ymm1[3,3],ymm4[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm9[3,3],ymm7[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1],ymm1[0,2],ymm3[7,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm6[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -795,124 +729,126 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm7 -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm1 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm6 +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm10 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm3 +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm3[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = 
ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[6],ymm2[6],ymm7[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm14[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm8 -; AVX2-SLOW-NEXT: 
vbroadcastss %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm12 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,3],ymm6[3,3],ymm5[7,7],ymm6[7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm4[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf8: ; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: subq $24, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm10 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm0[1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm12 ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero @@ -922,84 +858,85 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6],ymm6[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm7[3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[3,3],xmm14[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[6],ymm1[6],ymm8[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] 
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm3[2],ymm15[3,4,5],ymm3[6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1],ymm9[2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm7 +; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm15 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,1,2,2,0,1,2,2] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%rax), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm8 -; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm9 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm9 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = 
ymm7[0,1,2,3],ymm10[4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm13[3,3],xmm14[3,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6],ymm0[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vmovaps %xmm1, %xmm3 +; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm7[2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,1],ymm2[1,1],ymm8[5,5],ymm2[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6],ymm7[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1],ymm12[2,3,4],ymm7[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm8[2],ymm2[3],ymm8[3],ymm2[6],ymm8[6],ymm2[7],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,3],ymm6[3,3],ymm13[7,7],ymm6[7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%rax), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3],xmm4[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) -; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: addq $24, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1007,103 +944,101 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm3[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[6],ymm2[6],ymm7[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm15[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm14[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm10[2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm3[1,1],ymm9[5,5],ymm3[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = 
ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,3],ymm6[3,3],ymm5[7,7],ymm6[7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm4[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1116,39 +1051,39 @@ ; AVX512F-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-NEXT: vmovdqa (%r10), %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 -; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512F-NEXT: 
vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,23,31,7,6,23,31,7] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [31,7,15,23,31,7,15,23] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4],ymm0[5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm6 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> +; AVX512F-NEXT: vpermi2d %zmm4, %zmm1, %zmm7 ; AVX512F-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1162,39 +1097,39 @@ ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vmovdqa (%r10), %ymm3 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,23,31,7,6,23,31,7] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[6,23,31,7,6,23,31,7] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [31,7,15,23,31,7,15,23] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4],ymm0[5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm6 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm1, %zmm7 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1221,73 +1156,69 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $520, %rsp # imm = 0x208 +; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; 
SSE-NEXT: movaps (%rdx), %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm13 -; SSE-NEXT: movdqa 16(%r9), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 16(%rcx), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movaps %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm15 +; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 16(%rax), %xmm0 +; SSE-NEXT: 
movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm4 +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps 32(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -1297,10 +1228,10 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 @@ -1308,178 +1239,177 @@ ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps 48(%rcx), %xmm11 ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), 
%xmm2 -; SSE-NEXT: movaps 48(%rax), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm11[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps 48(%r9), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rax), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: punpcklqdq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm7[1,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[1,0],mem[0,0] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm12[2,0] +; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm5[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps 48(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,0] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm11[2,0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,3] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} 
xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm7[0],xmm12[1,2,3] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm7[0],xmm9[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm7[0],xmm8[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[3,0],mem[3,3] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm1, 416(%rax) -; SSE-NEXT: movaps %xmm3, 400(%rax) -; SSE-NEXT: movaps %xmm4, 384(%rax) +; SSE-NEXT: movaps %xmm9, 416(%rax) +; SSE-NEXT: movaps %xmm12, 400(%rax) +; SSE-NEXT: movaps %xmm2, 384(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps %xmm4, 336(%rax) ; SSE-NEXT: movdqa %xmm5, 288(%rax) -; SSE-NEXT: movaps %xmm6, 240(%rax) -; SSE-NEXT: movdqa %xmm15, 224(%rax) +; SSE-NEXT: movaps %xmm15, 240(%rax) +; SSE-NEXT: movdqa %xmm6, 224(%rax) ; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm11, 128(%rax) -; SSE-NEXT: movaps %xmm13, 112(%rax) -; SSE-NEXT: movaps %xmm14, 64(%rax) +; SSE-NEXT: movaps %xmm14, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1488,1028 +1418,987 @@ ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps %xmm8, 320(%rax) -; SSE-NEXT: movaps %xmm9, 304(%rax) +; SSE-NEXT: movaps %xmm7, 320(%rax) +; SSE-NEXT: movaps %xmm8, 304(%rax) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) -; SSE-NEXT: movaps %xmm12, 208(%rax) -; SSE-NEXT: movaps %xmm2, 192(%rax) +; SSE-NEXT: movaps %xmm3, 208(%rax) +; SSE-NEXT: movaps %xmm13, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) +; SSE-NEXT: movaps %xmm11, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $520, %rsp # imm = 0x208 +; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm8 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm12[0],xmm4[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vmovaps %xmm5, %xmm14 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,1],xmm11[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[1,1],xmm4[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm15[1],xmm14[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm9[1,1],ymm6[5,5],ymm9[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm8[1,1],ymm7[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm13 -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,1],ymm1[6,4],ymm0[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm14[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1],xmm0[0,2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm1[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm12[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm4[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm5[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm7[1,1],ymm15[5,5],ymm7[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm15[2,1],ymm2[6,4],ymm15[6,5] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm14[1,1],ymm13[5,5],ymm14[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[0,2],ymm1[5,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm11[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm11[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,3],ymm11[3,3],ymm13[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm4[1,1],ymm5[5,5],ymm4[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm6[3,3],ymm11[7,7],ymm6[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3],ymm1[3,3],ymm15[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3],ymm1[1,2],ymm5[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm7[1],ymm15[3],ymm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm12[1],ymm6[3],ymm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[0,2],ymm12[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm14[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm12[2],mem[2],xmm12[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm12, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm10[3,3],ymm3[7,7],ymm10[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm5[3,3],ymm4[7,7],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm10 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,1],ymm3[0,2],ymm13[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm6[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm5[0,2],ymm7[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm12[3,3],xmm14[3,3] -; AVX1-ONLY-NEXT: vblendps $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3],ymm13[3,3],ymm14[7,7],ymm13[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,3],ymm15[3,3],ymm7[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm6[3,3],mem[3,3],ymm6[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[4],ymm15[4],ymm7[5],ymm15[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[3,1],ymm6[0,2],ymm14[7,5],ymm6[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[3,3],xmm12[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1],ymm7[0,2],ymm9[7,5],ymm7[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm4 = xmm8[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-SLOW-NEXT: subq $424, %rsp # imm = 0x1A8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm6 -; AVX2-SLOW-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm8 -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm9 -; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm15 +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm10 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm13[1],zero +; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[6],ymm4[6],ymm7[7],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm14 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm3 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3,4,5],ymm0[6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: 
vmovaps 16(%rax), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm8, %ymm8 +; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,1,2,1] +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm8[1],xmm9[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm0 ; AVX2-SLOW-NEXT: 
vbroadcastss %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm15, %ymm3 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm12 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm15[3,3],xmm11[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm3[1,1],ymm14[5,5],ymm3[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[6],ymm14[6],ymm3[7],ymm14[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm12, %ymm5 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = 
ymm6[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm13 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,3],xmm10[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps %xmm6, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = 
xmm7[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vmovaps %xmm7, %xmm10 +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[1,1],ymm12[5,5],ymm11[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm0[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 60(%r9), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-SLOW-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $536, %rsp # imm = 0x218 +; AVX2-FAST-NEXT: subq $376, %rsp # imm = 0x178 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 
$1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm5 -; AVX2-FAST-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm10[1],xmm11[1],zero -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm8[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm5, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-FAST-NEXT: vbroadcastss %xmm10, %xmm10 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm14, %ymm9 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %xmm1, %xmm14 +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm15 +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] +; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm8[1],xmm12[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[6],ymm4[6],ymm7[7],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = 
ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm6 +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm1[2],ymm9[3,4,5],ymm1[6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2],xmm9[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,1,2,1] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm9[1],xmm10[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 +; AVX2-FAST-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = 
xmm5[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm3 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm15[3,3],xmm11[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,1],ymm2[1,1],ymm11[5,5],ymm2[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,3],ymm0[3,3],ymm8[7,7],ymm0[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4],ymm2[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; 
AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm10, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm4 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm6, %ymm4 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,3],xmm0[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm0 +; AVX2-FAST-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps %xmm8, %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vmovaps %xmm7, %xmm9 +; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1],ymm1[1,1],ymm7[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[3,3],ymm10[3,3],ymm2[7,7],ymm10[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0],ymm4[1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm2[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm9 -; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm15[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm10[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[1,1],ymm12[5,5],ymm11[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 60(%r8), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 60(%r9), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm15, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $536, %rsp # imm = 0x218 +; AVX2-FAST-NEXT: addq $376, %rsp # imm = 0x178 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-PERLANE-NEXT: subq $424, %rsp # imm = 0x1A8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm12[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm11[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm13[1],xmm3[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm10 +; 
AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[6],ymm4[6],ymm7[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = 
ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3,4,5],ymm0[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[6],ymm4[6],ymm8[7],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm8[1],xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm5[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm12[3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm13[2],xmm10[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm15[3,3],xmm11[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm3[1,1],ymm14[5,5],ymm3[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm14[2],ymm3[3],ymm14[3],ymm3[6],ymm14[6],ymm3[7],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = 
ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[6],ymm0[6],ymm14[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,3],ymm3[3,3],ymm1[7,7],ymm3[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3,4],ymm7[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm11[3,3],xmm10[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3],xmm9[3,3] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[1,1],ymm12[5,5],ymm11[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[4],ymm13[4],ymm0[5],ymm13[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = 
ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r9), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-FAST-PERLANE-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2765,72 +2654,72 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: subq $1272, %rsp # imm = 0x4F8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 ; SSE-NEXT: movaps (%rdx), %xmm14 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: 
movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm13 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movaps 16(%r8), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: movaps %xmm14, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 16(%rax), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -2842,20 +2731,21 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 48(%rsi), %xmm7 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -2865,19 +2755,16 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 +; SSE-NEXT: movdqa 64(%rsi), %xmm2 ; SSE-NEXT: movaps 64(%rdx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm3 @@ -2894,23 +2781,23 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm3 +; SSE-NEXT: movdqa 80(%rsi), %xmm2 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%r9), %xmm1 @@ -2922,378 +2809,378 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm6 +; SSE-NEXT: movdqa 96(%rsi), %xmm7 ; SSE-NEXT: movaps 96(%rdx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm3 -; SSE-NEXT: movaps 96(%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rcx), %xmm5 +; SSE-NEXT: movaps 96(%r8), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%r9), %xmm2 -; SSE-NEXT: movdqa 96(%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%r9), %xmm3 +; SSE-NEXT: movdqa 96(%rax), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm2 -; SSE-NEXT: movaps 112(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rax), %xmm0 +; SSE-NEXT: movaps 112(%rcx), %xmm3 +; SSE-NEXT: movaps 112(%r8), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r9), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps 112(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,2] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm9[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte 
Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm15[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[1,0] ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; 
SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm9[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm10[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] +; SSE-NEXT: shufps 
{{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm10 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm11[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE-NEXT: movaps 112(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = 
xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm0[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,1] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: 
shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm0[2,0] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm5, 864(%rax) -; SSE-NEXT: movaps %xmm7, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 800(%rax) -; SSE-NEXT: movaps %xmm6, 784(%rax) -; SSE-NEXT: movaps %xmm8, 736(%rax) -; SSE-NEXT: movaps %xmm9, 688(%rax) -; SSE-NEXT: movaps %xmm10, 672(%rax) -; SSE-NEXT: movaps %xmm11, 624(%rax) -; SSE-NEXT: movaps %xmm13, 576(%rax) -; SSE-NEXT: movaps %xmm15, 560(%rax) +; SSE-NEXT: movaps %xmm0, 864(%rax) +; SSE-NEXT: movaps %xmm5, 848(%rax) +; SSE-NEXT: movaps %xmm14, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%rax) +; SSE-NEXT: movaps %xmm0, 800(%rax) +; SSE-NEXT: movaps %xmm4, 784(%rax) +; SSE-NEXT: movaps %xmm6, 736(%rax) +; SSE-NEXT: movaps %xmm7, 688(%rax) +; SSE-NEXT: movaps %xmm8, 672(%rax) +; SSE-NEXT: movaps %xmm9, 624(%rax) +; SSE-NEXT: movaps %xmm10, 576(%rax) +; SSE-NEXT: movaps %xmm12, 560(%rax) +; SSE-NEXT: movdqa %xmm13, 512(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3328,7 +3215,7 @@ ; SSE-NEXT: movaps %xmm0, 816(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) -; SSE-NEXT: movaps %xmm14, 752(%rax) +; SSE-NEXT: movaps %xmm11, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 720(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3340,7 +3227,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 592(%rax) ; SSE-NEXT: movaps %xmm3, 544(%rax) -; SSE-NEXT: movaps %xmm12, 528(%rax) +; SSE-NEXT: movaps %xmm15, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3363,25 +3250,26 @@ ; SSE-NEXT: movaps %xmm0, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps %xmm4, 192(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $1256, %rsp # imm = 0x4E8 +; SSE-NEXT: addq $1272, %rsp # imm = 0x4F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 +; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3407,56 +3295,56 @@ ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm3[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm7[1,1],ymm2[5,5],ymm7[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3466,49 +3354,46 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = 
xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3517,7 +3402,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -3530,158 +3415,152 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm9[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm14[1,1],ymm0[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm12[1,1],ymm11[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm9[1,1],ymm2[5,5],ymm9[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[1,1],ymm6[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 
64(%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm6[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm3[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0,1],ymm15[2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm11[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, 
%ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm4[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3],ymm14[4,5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,1],xmm8[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm8[1],ymm14[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[6],ymm9[6],ymm13[7],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm14 +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm10[1],ymm5[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm15[3,3],ymm3[7,7],ymm15[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm15[3,3],ymm8[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3],ymm2[3,3],ymm12[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 124(%r8), %ymm1 @@ -3691,12 +3570,9 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[4],ymm15[4],ymm8[5],ymm15[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm3[0],ymm15[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,1],ymm0[0,2],ymm3[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 108(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -3705,8 +3581,8 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm8[1,1],ymm15[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[1,1],ymm15[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm12[1,1],ymm2[5,5],ymm12[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%r8), %ymm1 @@ -3719,28 +3595,27 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm12[3,3],ymm4[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,3],ymm13[3,3],ymm7[7,7],ymm13[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm5[3,3],ymm11[7,7],ymm5[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3],ymm14[3,3],ymm8[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -3748,131 +3623,128 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 
+; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3],ymm9[3,3],ymm13[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,3],ymm12[3,3],ymm11[7,7],ymm12[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm6[3,3],ymm10[7,7],ymm6[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm14[3,3],ymm6[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm14[3,3],ymm4[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,3],ymm12[3,3],ymm0[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[2,3],ymm10[1,2],ymm1[6,7],ymm10[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm2[1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte 
Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3],ymm9[3,3],ymm7[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[3,3],ymm1[3,3],ymm5[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm6[3,3],mem[3,3],ymm6[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3],ymm3[1,2],ymm4[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,1],ymm3[0,2],ymm11[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm13[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: 
# xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm5[0,2],ymm6[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm9[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[4],ymm12[4],ymm0[5],ymm12[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[3,1],ymm8[0,2],ymm13[7,5],ymm8[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm7 = ymm7[3,1],ymm9[0,2],ymm7[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm14[3,3],xmm10[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) @@ -3914,7 +3786,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -5489,407 +5361,391 @@ ; ; AVX512F-LABEL: store_i32_stride7_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm23 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm16 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm22 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm2, %zmm14 ; 
AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm21, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C +; AVX512F-NEXT: vpermi2d %zmm12, %zmm0, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm17, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm8, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-NEXT: vpermi2d %zmm7, %zmm18, %zmm23 +; AVX512F-NEXT: movw $-30962, %dx # imm = 0x870E +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm2 {%k2} ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 
+; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512F-NEXT: vpermt2d %zmm16, %zmm20, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512F-NEXT: vpermi2d %zmm8, %zmm6, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vpermi2d %zmm7, %zmm17, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm28, %zmm17 +; AVX512F-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vpermt2d %zmm7, %zmm22, %zmm17 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 -; AVX512F-NEXT: movw $-7741, %ax # imm = 0xE1C3 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 -; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 +; AVX512F-NEXT: kmovw %ecx, %k3 +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm9 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm31, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm5, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vpermt2d %zmm16, %zmm17, %zmm18 +; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vpermt2d %zmm6, %zmm22, %zmm30 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm10 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vpermt2d %zmm7, %zmm28, %zmm30 +; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermi2d 
%zmm15, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 -; AVX512F-NEXT: movw $-30962, %ax # imm = 0x870E -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 -; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm18 {%k2} +; AVX512F-NEXT: vpermi2d %zmm1, %zmm11, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm5, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm17 {%k1} +; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm28, %zmm22 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vpermt2d %zmm16, %zmm22, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vpermt2d %zmm16, %zmm29, %zmm23 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm5, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm13, %zmm5, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm16, %zmm25 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vpermt2d %zmm12, %zmm13, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm11, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm11, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vpermi2d %zmm4, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vpermt2d %zmm7, %zmm22, %zmm11 +; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm11 {%k1} +; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vpermt2d %zmm6, %zmm22, %zmm16 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
<22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 ; AVX512F-NEXT: movw $28897, %ax # imm = 0x70E1 -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 -; AVX512F-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 -; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} -; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm19 {%k1} +; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm24, %zmm22 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} +; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vpermt2d %zmm6, %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2d %zmm3, %zmm16, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm6, %zmm4 ; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
<0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm12, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm21, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm5, %zmm4 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512F-NEXT: popq %rax +; AVX512F-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm22 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; 
AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm2, %zmm14 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm21, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm8, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm18, %zmm23 +; AVX512BW-NEXT: movw $-30962, %dx # imm = 0x870E +; AVX512BW-NEXT: kmovd %edx, %k2 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm2 {%k2} ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm20, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm6, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm17, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm28, %zmm17 +; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm22, %zmm17 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm18 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21 -; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30 -; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 +; AVX512BW-NEXT: kmovd %ecx, %k3 +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm31, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm17, %zmm18 +; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm22, %zmm30 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm10 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm28, %zmm30 +; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd 
%eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5 -; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5 -; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm18 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm17 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm28, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm22, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm29, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm22, %zmm11 +; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm11 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0 -; AVX512BW-NEXT: 
vmovdqa32 %zmm20, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm22, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31 -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24 -; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23 -; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1} -; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm19 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm24, %zmm22 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} +; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm6, %zmm4 ; AVX512BW-NEXT: 
vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm4, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm25, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm5, %zmm4 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -5915,38 +5771,38 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $2760, %rsp # imm = 0xAC8 +; SSE-NEXT: subq $2776, %rsp # imm = 0xAD8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdx), %xmm13 ; SSE-NEXT: movdqa 16(%rdx), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm13 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm11 +; SSE-NEXT: movaps 
16(%rcx), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: movdqa 16(%r9), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[1,1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -5955,14 +5811,14 @@ ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5973,7 +5829,7 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] @@ -5993,21 +5849,22 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; 
SSE-NEXT: movdqa 48(%rsi), %xmm3 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -6017,16 +5874,16 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 ; SSE-NEXT: movaps 64(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] @@ -6044,23 +5901,24 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa 80(%rsi), %xmm4 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%r9), %xmm1 @@ -6070,23 +5928,24 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm1 ; SSE-NEXT: movaps 96(%rdx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm10 -; SSE-NEXT: movaps 96(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] +; SSE-NEXT: movaps 96(%rcx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm1 @@ -6096,23 +5955,24 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 +; SSE-NEXT: movdqa 112(%rsi), %xmm4 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 112(%rcx), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps 112(%r8), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%r9), %xmm1 @@ -6122,24 +5982,24 @@ ; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 112(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rsi), %xmm1 -; SSE-NEXT: movaps 128(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 128(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%r9), %xmm1 @@ -6151,21 +6011,21 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rsi), %xmm2 +; SSE-NEXT: movdqa 144(%rsi), %xmm4 ; SSE-NEXT: movdqa 144(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%r9), %xmm1 @@ -6177,22 +6037,22 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rsi), %xmm1 -; SSE-NEXT: movaps 160(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 160(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%r9), %xmm1 @@ -6204,21 +6064,21 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rsi), %xmm2 +; SSE-NEXT: movdqa 176(%rsi), %xmm4 ; SSE-NEXT: movdqa 176(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%r9), %xmm1 @@ -6230,22 +6090,22 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rsi), %xmm1 -; SSE-NEXT: movaps 192(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 
192(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%r9), %xmm1 @@ -6257,72 +6117,73 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rsi), %xmm3 +; SSE-NEXT: movdqa 208(%rsi), %xmm4 ; SSE-NEXT: movdqa 208(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm6 -; SSE-NEXT: movaps 208(%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: movaps 208(%rcx), %xmm2 +; SSE-NEXT: movaps 208(%r8), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%r9), %xmm6 -; SSE-NEXT: movdqa 208(%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%r9), %xmm10 +; SSE-NEXT: movdqa 208(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: 
punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: movaps 224(%rcx), %xmm2 ; SSE-NEXT: movaps 224(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 224(%r9), %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r9), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] +; SSE-NEXT: movaps 224(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[3,2] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 224(%rax), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6334,239 +6195,240 @@ ; SSE-NEXT: movaps 240(%rcx), %xmm6 ; SSE-NEXT: movaps 240(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] +; SSE-NEXT: movaps 240(%r9), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps 240(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm10[3,2] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[1,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rax), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0],xmm3[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = 
xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[1,3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm9[1,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[1,3] -; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 
%xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded 
Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -6576,46 +6438,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -6625,14 +6487,14 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -6656,7 +6518,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -6670,8 +6532,8 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,3] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm13 @@ -6689,296 +6551,295 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 224(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[2,0],xmm1[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm11[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm0[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,0],mem[3,3] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: 
shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm6[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm6[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: 
pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm4[0],xmm11[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = 
xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1760(%rax) -; SSE-NEXT: movaps %xmm7, 1744(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1760(%rax) +; SSE-NEXT: movaps %xmm8, 1744(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1728(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1696(%rax) -; SSE-NEXT: movaps %xmm5, 1680(%rax) -; SSE-NEXT: movaps %xmm8, 1648(%rax) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1696(%rax) +; SSE-NEXT: movaps %xmm7, 1680(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1648(%rax) ; SSE-NEXT: movaps %xmm9, 1632(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1616(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1584(%rax) -; SSE-NEXT: movaps %xmm10, 1568(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1584(%rax) +; SSE-NEXT: movaps %xmm11, 1568(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1536(%rax) ; SSE-NEXT: movaps %xmm13, 1520(%rax) @@ -7001,89 +6862,88 @@ ; SSE-NEXT: movaps %xmm0, 1136(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1120(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1072(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1024(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1008(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 960(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 912(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 896(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 848(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 800(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 784(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 736(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 688(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 672(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 624(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 576(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 560(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 512(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 464(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rax) -; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1776(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1712(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1664(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1600(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1552(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1504(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm6, 1424(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1072(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1024(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1008(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 960(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 912(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 896(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 848(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 800(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 784(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 736(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 688(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 672(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 624(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 560(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 512(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 464(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 
352(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1776(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1712(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1664(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1600(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1552(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1504(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1488(%rax) +; SSE-NEXT: movaps %xmm1, 1440(%rax) +; SSE-NEXT: movaps %xmm4, 1424(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1392(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1376(%rax) -; SSE-NEXT: movaps %xmm11, 1328(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1312(%rax) +; SSE-NEXT: movaps %xmm6, 1328(%rax) +; SSE-NEXT: movaps %xmm10, 1312(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1280(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7104,7 +6964,7 @@ ; SSE-NEXT: movaps %xmm0, 1056(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1040(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 992(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 976(%rax) @@ -7120,7 +6980,7 @@ ; SSE-NEXT: movaps %xmm0, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 816(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 752(%rax) @@ -7154,7 +7014,8 @@ ; SSE-NEXT: movaps %xmm0, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm2, 304(%rax) +; SSE-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7167,98 +7028,95 @@ ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm5, 96(%rax) +; SSE-NEXT: movaps %xmm3, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $2760, %rsp # imm = 0xAC8 +; SSE-NEXT: addq $2776, %rsp # imm = 0xAD8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3432, %rsp # imm = 0xD68 +; AVX1-ONLY-NEXT: subq $3160, %rsp # imm = 0xC58 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm7[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7274,43 +7132,39 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm4 ; AVX1-ONLY-NEXT: 
vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm7[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm12[1],xmm10[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7332,41 +7186,38 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: 
vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm9[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm9[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 @@ -7383,9 +7234,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7395,42 +7245,38 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 
= xmm7[1,1],xmm9[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 @@ -7447,8 +7293,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7458,42 +7304,39 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm3 ; AVX1-ONLY-NEXT: 
vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 @@ -7502,17 +7345,16 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[1,1],ymm9[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm1 -; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm2[0],ymm8[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7521,60 +7363,58 @@ ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm7[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm0 = xmm5[1,1],xmm9[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm3[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm15[1,1],ymm0[5,5],ymm15[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,1],ymm1[6,4],ymm11[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm12[1,1],ymm11[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -7582,219 +7422,215 @@ ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rax), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm0[1,1],ymm7[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[1,1],ymm6[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm2[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,1],ymm12[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm3[2,1],ymm9[6,4],ymm3[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[6],ymm13[6],ymm5[7],ymm13[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1],ymm4[0,2],ymm8[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm1[0,2],ymm4[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm1[0,2],ymm4[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm4[0,2],ymm3[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[0,2],ymm14[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm4[0,2],ymm14[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 
= ymm15[1,1],ymm1[0,2],ymm15[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,1],ymm4[0,2],ymm6[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm4[0,2],ymm13[5,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0],ymm12[1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm1[0,2],ymm4[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm12[1],xmm4[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[0,2] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm4[1],xmm9[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm12[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,2],xmm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm3[3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 232(%r9), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 232(%r9), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm11[3,3],ymm12[7,7],ymm11[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm7[3,3],ymm11[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 220(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,3],ymm6[3,3],ymm7[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 220(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%rax), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 @@ -7806,13 +7642,13 @@ ; AVX1-ONLY-NEXT: vbroadcastss 224(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm4[3,3],ymm14[7,7],ymm4[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm2[3,3],ymm4[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -7822,9 +7658,9 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 236(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -7833,8 +7669,8 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm14[1,1],ymm4[5,5],ymm14[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r8), %ymm1 @@ -7847,26 +7683,25 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm5[3,3],ymm0[7,7],ymm5[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[3,3],mem[3,3],ymm7[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7881,13 +7716,12 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte 
Folded Reload +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] @@ -7896,12 +7730,11 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm13[3,3],ymm0[7,7],ymm13[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7916,10 +7749,9 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7931,12 +7763,11 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 
# 32-byte Reload @@ -7951,10 +7782,9 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7966,18 +7796,19 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm10[3,3],ymm0[7,7],ymm10[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3],ymm1[1,2],ymm3[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] @@ -7985,184 +7816,180 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rax), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm15[3,3],ymm12[7,7],ymm15[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm11[3,3],mem[3,3],ymm11[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3],ymm1[1,2],ymm3[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3],ymm3[1,2],ymm4[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = 
xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3],ymm9[3,3],ymm10[7,7],ymm9[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3],ymm9[3,3],ymm0[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3],ymm13[3,3],ymm10[7,7],ymm13[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,3],ymm8[3,3],ymm9[7,7],ymm8[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3],ymm3[1,2],ymm4[6,7],ymm3[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,1],ymm3[0,2],ymm7[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), 
%xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,1],ymm3[0,2],ymm7[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm3[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,1],ymm3[0,2],ymm6[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] ; 
AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm3[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,1],ymm3[0,2],ymm2[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[4],ymm11[4],ymm2[5],ymm11[5] -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1],ymm12[0,2],ymm2[7,5],ymm12[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[4],ymm9[4],ymm0[5],ymm9[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[3,1],ymm15[0,2],ymm10[7,5],ymm15[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload -; 
AVX1-ONLY-NEXT: # xmm12 = xmm13[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm11[0,2],ymm1[7,5],ymm11[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1],ymm9[0,2],ymm10[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm14[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[3,1],ymm13[0,2],ymm0[7,5],ymm13[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm7[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[3,1],ymm11[0,2],ymm1[7,5],ymm11[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm5[3,3],xmm12[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1440(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1440(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 1216(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 992(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8261,57 +8088,57 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX1-ONLY-NEXT: addq $3432, %rsp # imm = 0xD68 +; AVX1-ONLY-NEXT: addq $3160, %rsp # imm = 0xC58 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2968, %rsp # imm = 0xB98 +; AVX2-SLOW-NEXT: subq $2984, %rsp # imm = 0xBA8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm5 +; AVX2-SLOW-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-SLOW-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm10[1],zero ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovaps %xmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8474,11 +8301,11 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8583,11 +8410,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 
192(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8651,14 +8478,14 @@ ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,1],ymm2[1,1],ymm11[5,5],ymm2[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] ; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] @@ -8668,7 +8495,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] @@ -8684,20 +8511,20 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm7 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -8709,12 +8536,12 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] @@ -8880,13 +8707,13 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm7 @@ -8899,15 +8726,15 @@ ; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; 
AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] @@ -8919,8 +8746,8 @@ ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm12[2],ymm4[3],ymm12[3],ymm4[6],ymm12[6],ymm4[7],ymm12[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -8931,33 +8758,33 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm7[1,1],ymm11[5,5],ymm7[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm10[1,1],ymm15[5,5],ymm10[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9046,9 +8873,10 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -9059,43 +8887,44 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 
16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[6],ymm11[6],ymm7[7],ymm11[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3],ymm13[3,3],ymm12[7,7],ymm13[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovaps %ymm10, %ymm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[6],ymm15[6],ymm7[7],ymm15[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -9105,11 +8934,10 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -9122,8 +8950,8 @@ ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -9134,7 +8962,8 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] @@ -9147,7 +8976,7 @@ ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -9160,94 +8989,94 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm1[6],ymm14[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm9[3,3],mem[3,3] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4],ymm14[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = 
ymm13[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] +; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm13, 1440(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 1216(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 1088(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 992(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 864(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 1216(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 1088(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 992(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 864(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 768(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 1472(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9330,63 +9159,63 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-SLOW-NEXT: addq $2968, %rsp # imm = 0xB98 +; AVX2-SLOW-NEXT: addq $2984, %rsp # imm = 0xBA8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $3080, %rsp # imm = 0xC08 +; AVX2-FAST-NEXT: subq $3096, %rsp # imm = 0xC18 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm5[1],zero +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm6[1],zero ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9532,22 +9361,22 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm15 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm12 +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm15 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3,4,5],ymm12[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9612,11 +9441,11 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9635,11 +9464,11 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9659,17 +9488,18 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm12 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm10 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm11 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm8 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm6 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 192(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9688,40 +9518,40 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm6 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm3[1],zero +; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm8 +; 
AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm8[1],xmm3[1],zero ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm5 +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm5 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2] ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm4, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastss 224(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 232(%rax), %ymm1 @@ -9731,52 +9561,52 @@ ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm11 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6],ymm14[7] ; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vbroadcastss 240(%r9), %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 240(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-NEXT: vbroadcastss 220(%r9), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vbroadcastsd 216(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm11 +; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm13 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm13 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm13, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm14[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] @@ -9796,25 +9626,25 @@ ; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -9828,8 +9658,8 @@ ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -9838,29 +9668,29 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm5[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -9884,30 +9714,30 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = 
xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9932,43 +9762,43 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9980,43 +9810,43 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10028,30 +9858,30 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm7, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = 
xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10062,9 +9892,9 @@ ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10105,209 +9935,210 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 208(%rax), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermilps 
$0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 208(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = 
ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3,4],ymm4[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = 
ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm1 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm12[0],ymm1[1],ymm12[1],ymm1[4],ymm12[4],ymm1[5],ymm12[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: 
vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0],ymm4[1,2],ymm12[3,4],ymm4[5,6],ymm12[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm13[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps 
{{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[4],ymm0[4],ymm15[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovaps %ymm14, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm14[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm15[1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm14 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm1[3,3],mem[3,3] +; 
AVX2-FAST-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm4, 1440(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, 1440(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 1216(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 1088(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 992(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 864(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm2, 768(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm3, 544(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 1216(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 1088(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 992(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, 864(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 768(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 416(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10398,57 +10229,57 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 1568(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX2-FAST-NEXT: addq $3080, %rsp # imm = 0xC08 +; AVX2-FAST-NEXT: addq $3096, %rsp # imm = 0xC18 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2968, %rsp # imm = 0xB98 +; AVX2-FAST-PERLANE-NEXT: subq $2984, %rsp # imm = 0xBA8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 -; 
AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm10[1],zero ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10611,11 +10442,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10720,11 +10551,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10788,14 +10619,14 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,1],ymm2[1,1],ymm12[5,5],ymm2[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm11[1,1],ymm2[1,1],ymm11[5,5],ymm2[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7] @@ -10805,7 +10636,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} 
xmm15 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] @@ -10821,20 +10652,20 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm6[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -10846,12 +10677,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm14[3,3],xmm15[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm7 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] @@ -11017,13 +10848,13 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte 
Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm7 @@ -11036,15 +10867,15 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[4],ymm4[4],ymm11[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[4],ymm4[4],ymm12[5],ymm4[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[6],ymm4[6],ymm11[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[6],ymm4[6],ymm12[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[6],ymm2[6],ymm12[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = mem[1,2,2,3,5,6,6,7] @@ -11056,8 +10887,8 @@ ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[6],ymm11[6],ymm4[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm12[2],ymm4[3],ymm12[3],ymm4[6],ymm12[6],ymm4[7],ymm12[7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11068,33 +10899,33 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm6[1,1],ymm12[5,5],ymm6[5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm7[1,1],ymm11[5,5],ymm7[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm10[1,1],ymm15[5,5],ymm10[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11183,9 +11014,10 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm10[1,1],ymm0[5,5],ymm10[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm9[1,1],ymm0[5,5],ymm9[5,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11196,43 +11028,44 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm12[2],ymm6[3],ymm12[3],ymm6[6],ymm12[6],ymm6[7],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[6],ymm11[6],ymm7[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,3],ymm14[3,3],ymm13[7,7],ymm14[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3],ymm13[3,3],ymm12[7,7],ymm13[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[4],ymm15[4],ymm4[5],ymm15[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[4],ymm14[4],ymm4[5],ymm14[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[6],ymm15[6],ymm7[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11242,11 +11075,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] 
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -11259,8 +11091,8 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -11271,7 +11103,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] @@ -11284,7 +11117,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] @@ -11297,94 +11130,94 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps 
{{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm1[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm0[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm7[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm9[3,3],mem[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4],ymm14[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 1440(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 1216(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1088(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 992(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 864(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 768(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 640(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 1216(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 1088(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 992(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 864(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 768(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 192(%rax) -; 
AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1504(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 1472(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11467,7 +11300,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2968, %rsp # imm = 0xB98 +; AVX2-FAST-PERLANE-NEXT: addq $2984, %rsp # imm = 0xBA8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -12445,7 +12278,6 @@ ; AVX: {{.*}} ; AVX1: {{.*}} ; AVX2: {{.*}} -; AVX2-ONLY: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -151,45 +151,45 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps (%r8), %xmm2 -; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps (%r11), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm1 +; SSE-NEXT: movaps (%r9), %xmm8 +; SSE-NEXT: movaps (%r11), %xmm5 ; SSE-NEXT: movaps (%r10), %xmm9 -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: movaps %xmm5, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] ; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = 
xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 112(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) ; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) ; SSE-NEXT: movaps %xmm13, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf4: @@ -197,50 +197,50 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,0],ymm5[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm4[1,0],ymm8[5,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,0],ymm7[2,3],ymm10[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm0[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,0],ymm4[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1],xmm8[0,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,0],ymm3[1,0],ymm2[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm9[1],xmm0[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm2[2],xmm3[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm6[2],xmm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[3,0],ymm8[7,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm3[3,0],ymm2[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,3],ymm2[6,4],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ 
-352,134 +352,117 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm9 +; SSE-NEXT: movaps 16(%rdi), %xmm5 ; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm10 -; SSE-NEXT: movaps (%r9), %xmm1 +; SSE-NEXT: movaps 16(%rsi), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm12 +; SSE-NEXT: movaps 16(%rdx), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps (%r8), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm3 ; SSE-NEXT: movaps (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), %xmm12 -; SSE-NEXT: movaps (%rax), %xmm4 -; SSE-NEXT: movaps 16(%rax), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movaps (%rax), %xmm15 +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] -; SSE-NEXT: movaps 16(%r9), %xmm7 -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = 
xmm13[0],xmm7[0],xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm6[0] -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movaps %xmm12, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm14[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm3[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm9[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm6[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] +; SSE-NEXT: movaps 16(%rcx), %xmm11 +; 
SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,0],xmm12[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm13[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,0],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movaps 16(%r8), %xmm15 +; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm15, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm11[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm12[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm1[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 224(%rax) -; SSE-NEXT: movaps %xmm10, 240(%rax) -; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm14, 96(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm5, 48(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps %xmm15, 240(%rax) +; SSE-NEXT: movaps %xmm5, 224(%rax) +; SSE-NEXT: movaps %xmm4, 208(%rax) +; SSE-NEXT: movaps %xmm2, 192(%rax) +; SSE-NEXT: movaps %xmm3, 176(%rax) +; SSE-NEXT: movaps %xmm14, 160(%rax) +; SSE-NEXT: movaps %xmm7, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm9, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm8, 48(%rax) 
+; SSE-NEXT: movaps %xmm10, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf8: @@ -487,227 +470,219 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,0],ymm11[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,0],ymm7[4,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm9[2,3],ymm7[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm12 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm13 = xmm5[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm10[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm11 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = 
ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%r11), %ymm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 24(%r11), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm10 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX2-ONLY-NEXT: vextractf128 $1, 
%ymm5, %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm12 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r11), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm8[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = 
xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, 
%xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -846,44 +821,44 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; SSE-NEXT: movaps %xmm9, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rcx), %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: movaps 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm10, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: 
movaps 16(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps 16(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm9 @@ -892,35 +867,35 @@ ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps 32(%rdi), %xmm14 ; SSE-NEXT: movaps 32(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm14, %xmm5 @@ -929,10 +904,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%r10), %xmm2 +; SSE-NEXT: movaps 32(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm13 @@ -940,75 +915,75 @@ ; SSE-NEXT: movaps %xmm13, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = 
xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[2,0] +; SSE-NEXT: movaps 48(%rdx), %xmm8 +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rsi), %xmm12 ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] ; SSE-NEXT: movaps %xmm4, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps 48(%r10), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%r10), %xmm6 +; SSE-NEXT: movaps 48(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE-NEXT: movaps 48(%r8), %xmm5 ; SSE-NEXT: movaps 48(%r9), %xmm9 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[2,0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[1,1] +; 
SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] ; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm5, 496(%rax) ; SSE-NEXT: movaps %xmm3, 480(%rax) ; SSE-NEXT: movaps %xmm9, 464(%rax) ; SSE-NEXT: movaps %xmm12, 448(%rax) -; SSE-NEXT: movaps %xmm6, 432(%rax) +; SSE-NEXT: movaps %xmm7, 432(%rax) ; SSE-NEXT: movaps %xmm4, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) ; SSE-NEXT: movaps %xmm15, 384(%rax) @@ -1062,493 +1037,463 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = 
ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,0],ymm2[2,3],ymm10[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm14 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm13[1,0],ymm14[5,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[6],ymm15[6],ymm12[7],ymm15[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm14 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm11[1,0],ymm12[5,4],ymm11[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm15[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = 
xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm12 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = 
xmm3[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm2[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm14[2],xmm8[2],xmm14[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm5[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: 
vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: subq $136, %rsp ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm13 -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[4],ymm13[4],ymm11[5],ymm13[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm10 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm12 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm13 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vbroadcastss 52(%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm9 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[6],ymm0[6],ymm5[7],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm9[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm3 -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm6 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm6[1],ymm11[3],ymm6[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[4],ymm3[4],ymm10[5],ymm3[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = 
ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm11 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm6 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm9 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 
60(%r10), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[6],ymm11[6],ymm0[7],ymm11[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm8 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm13 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm15 +; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm13[2],mem[2],xmm13[3],mem[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm2[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm15 +; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm9 +; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] +; AVX2-ONLY-NEXT: 
vmovaps 32(%r9), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1839,81 +1784,81 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rcx), %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] -; SSE-NEXT: movaps 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; 
SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 32(%rdi), %xmm7 ; SSE-NEXT: movaps 32(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -1922,10 +1867,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm2 -; SSE-NEXT: movaps 32(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 32(%r10), %xmm3 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1933,36 +1878,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -1971,10 +1916,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm2 -; SSE-NEXT: movaps 48(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 48(%r10), %xmm3 +; SSE-NEXT: movaps 48(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1982,36 +1927,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: 
movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -2020,10 +1965,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm2 -; SSE-NEXT: movaps 64(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 64(%r10), %xmm3 +; SSE-NEXT: movaps 64(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2031,36 +1976,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 80(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -2069,10 +2014,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movaps 80(%r10), %xmm2 -; SSE-NEXT: movaps 80(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 80(%r10), %xmm3 +; SSE-NEXT: movaps 80(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2080,134 +2025,134 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm2 -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps 96(%rdx), %xmm0 +; SSE-NEXT: movaps 96(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps 96(%rdi), %xmm15 ; SSE-NEXT: movaps 96(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm15, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 96(%r8), %xmm13 +; SSE-NEXT: movaps 96(%r10), %xmm3 +; SSE-NEXT: movaps 96(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = 
xmm5[0],xmm3[0] +; SSE-NEXT: movaps 96(%r8), %xmm7 ; SSE-NEXT: movaps 96(%r9), %xmm6 -; SSE-NEXT: movaps %xmm13, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; SSE-NEXT: movaps %xmm12, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm6[2],xmm13[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: movaps %xmm13, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm1 -; SSE-NEXT: movaps 112(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm1[2,0] +; SSE-NEXT: movaps 112(%rdx), %xmm6 +; SSE-NEXT: movaps 112(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps 112(%rsi), %xmm12 +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps 112(%r10), %xmm0 -; SSE-NEXT: movaps 112(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps 112(%r10), %xmm5 +; SSE-NEXT: movaps 
112(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE-NEXT: movaps 112(%r8), %xmm4 ; SSE-NEXT: movaps 112(%r9), %xmm9 -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm4, 1008(%rax) ; SSE-NEXT: movaps %xmm2, 992(%rax) ; SSE-NEXT: movaps %xmm9, 976(%rax) -; SSE-NEXT: movaps %xmm11, 960(%rax) -; SSE-NEXT: movaps %xmm6, 944(%rax) -; SSE-NEXT: movaps %xmm3, 928(%rax) +; SSE-NEXT: movaps %xmm12, 960(%rax) +; SSE-NEXT: movaps %xmm8, 944(%rax) +; SSE-NEXT: movaps %xmm11, 928(%rax) ; SSE-NEXT: movaps %xmm10, 912(%rax) ; SSE-NEXT: movaps %xmm14, 896(%rax) -; SSE-NEXT: movaps %xmm13, 880(%rax) +; SSE-NEXT: movaps %xmm7, 880(%rax) ; SSE-NEXT: movaps %xmm15, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps %xmm12, 816(%rax) +; SSE-NEXT: movaps %xmm13, 816(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2322,45 +2267,45 @@ ; AVX1-ONLY-NEXT: vmovaps 
32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,0],ymm8[1,0],ymm11[5,4],ymm8[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[2,3],ymm9[6,4],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,0],ymm9[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
ymm13 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2,0],ymm9[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm6[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm9 @@ -2370,7 +2315,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] @@ -2383,7 +2328,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] @@ -2392,7 +2337,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[4],ymm11[4],ymm8[5],ymm11[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm9[1,0],ymm10[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[4],ymm7[4],ymm2[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] @@ -2405,32 +2350,32 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 
64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,0],ymm9[3,0],ymm10[7,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 @@ -2439,32 +2384,32 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm6[2,3],ymm8[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,0],ymm7[2,3],ymm8[6,4],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm5[3,0],ymm7[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 @@ -2526,19 +2471,19 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), 
%xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] @@ -2549,32 +2494,32 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 @@ -3260,57 +3205,57 @@ ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -3332,10 +3277,10 @@ ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -3356,33 +3301,33 @@ ; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -3401,28 +3346,28 @@ ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512F-NEXT: 
vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -3523,22 +3468,22 @@ ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3554,57 +3499,57 @@ ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, 
%zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -3626,10 +3571,10 @@ ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -3650,33 +3595,33 @@ ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -3695,28 +3640,28 @@ ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; 
AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -3817,22 +3762,22 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3888,81 +3833,81 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] -; SSE-NEXT: movaps 16(%r10), %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] +; SSE-NEXT: movaps 
16(%rcx), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE-NEXT: movaps %xmm10, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm9 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[0,2] +; 
SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 32(%rdi), %xmm7 ; SSE-NEXT: movaps 32(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -3971,10 +3916,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm2 -; SSE-NEXT: movaps 32(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 32(%r10), %xmm3 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -3982,36 +3927,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = 
xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4020,10 +3965,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm2 -; SSE-NEXT: movaps 48(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 48(%r10), %xmm3 +; SSE-NEXT: movaps 48(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4031,36 +3976,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; 
SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps 64(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4069,10 +4014,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm2 -; SSE-NEXT: movaps 64(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 64(%r10), %xmm3 +; SSE-NEXT: movaps 64(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4080,36 +4025,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 80(%rdi), %xmm7 ; SSE-NEXT: movaps 80(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4118,10 +4063,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm2 -; SSE-NEXT: movaps 80(%rax), %xmm3 
-; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 80(%r10), %xmm3 +; SSE-NEXT: movaps 80(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4129,36 +4074,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 96(%rdi), %xmm7 ; SSE-NEXT: movaps 96(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4167,10 +4112,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm2 -; SSE-NEXT: movaps 96(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 96(%r10), %xmm3 +; SSE-NEXT: movaps 96(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4178,36 +4123,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 
= xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdx), %xmm0 -; SSE-NEXT: movaps 112(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 112(%rdi), %xmm7 ; SSE-NEXT: movaps 112(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4216,10 +4161,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm2 -; SSE-NEXT: movaps 112(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 112(%r10), %xmm3 +; SSE-NEXT: movaps 112(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 112(%r8), %xmm11 ; SSE-NEXT: movaps 112(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4227,36 +4172,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; 
SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdx), %xmm0 -; SSE-NEXT: movaps 128(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 128(%rdx), %xmm1 +; SSE-NEXT: movaps 128(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 128(%rdi), %xmm7 ; SSE-NEXT: movaps 128(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4265,10 +4210,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r10), %xmm2 -; SSE-NEXT: movaps 128(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 128(%r10), %xmm3 +; SSE-NEXT: movaps 128(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 128(%r8), %xmm11 ; SSE-NEXT: movaps 128(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4276,36 +4221,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; 
SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdx), %xmm0 -; SSE-NEXT: movaps 144(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 144(%rdx), %xmm1 +; SSE-NEXT: movaps 144(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: movaps 144(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4314,10 +4259,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r10), %xmm2 -; SSE-NEXT: movaps 144(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 144(%r10), %xmm3 +; SSE-NEXT: movaps 144(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 144(%r8), %xmm11 ; SSE-NEXT: movaps 144(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4325,36 +4270,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps 
{{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdx), %xmm0 -; SSE-NEXT: movaps 160(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 160(%rdx), %xmm1 +; SSE-NEXT: movaps 160(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 160(%rdi), %xmm7 ; SSE-NEXT: movaps 160(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4363,10 +4308,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r10), %xmm2 -; SSE-NEXT: movaps 160(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 160(%r10), %xmm3 +; SSE-NEXT: movaps 160(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 160(%r8), %xmm11 ; SSE-NEXT: movaps 160(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4374,36 +4319,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdx), %xmm0 -; SSE-NEXT: movaps 176(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 176(%rdx), %xmm1 +; SSE-NEXT: movaps 176(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 176(%rdi), %xmm7 ; SSE-NEXT: movaps 176(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4412,10 +4357,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r10), %xmm2 -; SSE-NEXT: movaps 176(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 176(%r10), %xmm3 +; SSE-NEXT: movaps 176(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 176(%r8), %xmm11 ; SSE-NEXT: movaps 176(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4423,36 +4368,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdx), %xmm0 -; SSE-NEXT: movaps 192(%rcx), %xmm1 -; SSE-NEXT: 
movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps 192(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 192(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4461,10 +4406,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%r10), %xmm2 -; SSE-NEXT: movaps 192(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 192(%r10), %xmm3 +; SSE-NEXT: movaps 192(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 192(%r8), %xmm11 ; SSE-NEXT: movaps 192(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4472,36 +4417,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdx), %xmm0 -; SSE-NEXT: movaps 208(%rcx), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps 208(%rdx), %xmm1 +; SSE-NEXT: movaps 208(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 208(%rdi), %xmm7 ; SSE-NEXT: movaps 208(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4510,10 +4455,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} 
xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm2 -; SSE-NEXT: movaps 208(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 208(%r10), %xmm3 +; SSE-NEXT: movaps 208(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 208(%r8), %xmm11 ; SSE-NEXT: movaps 208(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4521,36 +4466,36 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm1 -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps 224(%rdx), %xmm0 +; SSE-NEXT: movaps 224(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 224(%rdi), %xmm12 ; SSE-NEXT: movaps 224(%rsi), %xmm4 ; SSE-NEXT: movaps %xmm12, %xmm5 @@ -4559,10 +4504,10 @@ ; SSE-NEXT: movaps %xmm5, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r10), %xmm2 -; SSE-NEXT: movaps 224(%rax), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movaps 224(%r10), %xmm3 +; SSE-NEXT: movaps 224(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: movaps 
224(%r8), %xmm15 ; SSE-NEXT: movaps 224(%r9), %xmm6 ; SSE-NEXT: movaps %xmm15, %xmm13 @@ -4570,77 +4515,77 @@ ; SSE-NEXT: movaps %xmm13, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; SSE-NEXT: movaps %xmm2, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 240(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm2[2,0] +; SSE-NEXT: movaps 240(%rdx), %xmm6 +; SSE-NEXT: movaps 240(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE-NEXT: movaps 240(%rdi), %xmm4 +; SSE-NEXT: movaps 240(%rsi), %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps 240(%r10), %xmm0 -; SSE-NEXT: movaps 240(%rax), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps 240(%r10), %xmm3 +; SSE-NEXT: movaps 240(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps 240(%r8), %xmm5 -; SSE-NEXT: movaps 240(%r9), %xmm9 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm3[2,0] -; SSE-NEXT: movaps 
%xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; SSE-NEXT: movaps %xmm1, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[0,2] +; SSE-NEXT: movaps 240(%r9), %xmm8 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm5, 2032(%rax) -; SSE-NEXT: movaps %xmm1, 2016(%rax) -; SSE-NEXT: movaps %xmm9, 2000(%rax) -; SSE-NEXT: movaps %xmm11, 1984(%rax) -; SSE-NEXT: movaps %xmm6, 1968(%rax) -; SSE-NEXT: movaps %xmm4, 1952(%rax) -; SSE-NEXT: movaps %xmm10, 1936(%rax) +; SSE-NEXT: movaps %xmm4, 2016(%rax) +; SSE-NEXT: movaps %xmm8, 2000(%rax) +; SSE-NEXT: movaps %xmm10, 1984(%rax) +; SSE-NEXT: movaps %xmm7, 1968(%rax) +; SSE-NEXT: movaps %xmm11, 1952(%rax) +; SSE-NEXT: movaps %xmm9, 1936(%rax) ; SSE-NEXT: movaps %xmm14, 1920(%rax) ; SSE-NEXT: movaps %xmm15, 1904(%rax) ; SSE-NEXT: movaps %xmm12, 1888(%rax) @@ -4891,8 +4836,8 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm9 @@ -5823,8 +5768,8 @@ ; AVX2-ONLY-LABEL: store_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 @@ -5835,8 +5780,8 @@ ; 
AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm12 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 @@ -5858,21 +5803,21 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm13 +; AVX2-ONLY-NEXT: vbroadcastss 24(%r10), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm5 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm12[2],ymm5[3],ymm12[3],ymm5[6],ymm12[6],ymm5[7],ymm12[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm5[1],ymm11[3],ymm5[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 @@ -5902,7 +5847,7 @@ ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm5[7] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] @@ -5916,7 +5861,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm1 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] @@ -5932,8 +5877,8 @@ ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm3[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 
64(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -5949,7 +5894,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] @@ -5960,7 +5905,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] @@ -5980,8 +5925,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -5997,7 +5942,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6008,7 +5953,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6028,8 +5973,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 
128(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6045,7 +5990,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 152(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 152(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6056,7 +6001,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 156(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 156(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6076,8 +6021,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 160(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 160(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6093,7 +6038,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 184(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 184(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6104,7 +6049,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 188(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 188(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6124,8 +6069,8 @@ ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 192(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 192(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6141,7 +6086,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 216(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 216(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6152,7 +6097,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 220(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 220(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6172,8 +6117,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 224(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 224(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6189,7 +6134,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 248(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 248(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6200,7 +6145,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 252(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 252(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = 
ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6218,9 +6163,9 @@ ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm8 @@ -6273,8 +6218,8 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] @@ -6328,8 +6273,8 @@ ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] @@ -6383,9 +6328,9 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm8 @@ -6438,9 +6383,9 @@ ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 128(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps 128(%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm8 @@ -6493,9 +6438,9 @@ ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 160(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps 160(%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm8 @@ -6549,9 +6494,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vmovaps 192(%r10), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm6 @@ -6564,7 +6509,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm6[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] @@ -6573,8 +6518,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] @@ -6584,7 +6529,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -6594,14 +6539,14 @@ ; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm5 ; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm14 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm15 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -60,15 +60,15 @@ ; SSE-NEXT: movaps (%rsi), %xmm2 ; SSE-NEXT: movaps 16(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf4: @@ -130,25 +130,25 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm6 ; SSE-NEXT: movaps 48(%rsi), %xmm7 ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm6, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm6, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf8: @@ -182,22 +182,22 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd 
{{.*#+}} ymm5 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -240,48 +240,48 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm15 ; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] ; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm13[1] ; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] ; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] ; SSE-NEXT: movaps 112(%rsi), %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] -; SSE-NEXT: movaps %xmm0, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps %xmm13, 176(%rdx) -; SSE-NEXT: movaps %xmm3, 128(%rdx) -; SSE-NEXT: movaps %xmm15, 144(%rdx) -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 
48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, 240(%rdx) +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm12, 192(%rdx) +; SSE-NEXT: movaps %xmm4, 176(%rdx) +; SSE-NEXT: movaps %xmm13, 160(%rdx) +; SSE-NEXT: movaps %xmm3, 144(%rdx) +; SSE-NEXT: movaps %xmm15, 128(%rdx) +; SSE-NEXT: movaps %xmm5, 112(%rdx) +; SSE-NEXT: movaps %xmm14, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm10, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm9, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf16: @@ -294,18 +294,18 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3] @@ -318,14 +318,14 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 160(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -339,38 +339,38 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3] +; 
AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -380,17 +380,17 @@ ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm6 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -556,44 +556,44 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 
160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 @@ -621,23 +621,23 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 352(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 224(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 416(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 352(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 288(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 224(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 288(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -657,76 +657,76 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} 
ymm3 = ymm7[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 
= ymm6[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 256(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 288(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -740,10 +740,10 @@ ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -753,14 +753,14 @@ ; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 ; AVX512-NEXT: vpermi2q %zmm7, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 
128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -1522,10 +1522,10 @@ ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -1547,22 +1547,22 @@ ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 ; AVX512-NEXT: vpermi2q %zmm8, %zmm0, %zmm16 ; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 896(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 960(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 768(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm10, 832(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm3, 512(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 960(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 896(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 832(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm10, 768(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 704(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 512(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -23,10 +23,10 @@ ; SSE-NEXT: movapd (%rdx), %xmm2 ; SSE-NEXT: movapd %xmm0, %xmm3 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd %xmm1, 32(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) ; SSE-NEXT: movapd %xmm3, 
(%rcx) ; SSE-NEXT: retq ; @@ -84,26 +84,26 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride3_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm4, 16(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps %xmm5, 64(%rcx) -; SSE-NEXT: movaps %xmm6, 80(%rcx) +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm3 +; SSE-NEXT: movapd (%rdx), %xmm4 +; SSE-NEXT: movapd 16(%rdx), %xmm5 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movapd %xmm3, 80(%rcx) +; SSE-NEXT: movapd %xmm1, 64(%rcx) +; SSE-NEXT: movapd %xmm4, 48(%rcx) +; SSE-NEXT: movapd %xmm2, 32(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm6, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf4: @@ -123,8 +123,8 @@ ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -139,15 +139,15 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -179,46 +179,46 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm7 -; SSE-NEXT: movaps 16(%rsi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm9 -; SSE-NEXT: movaps 48(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps 32(%rdx), %xmm6 -; SSE-NEXT: movaps 48(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 32(%rcx) -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm9, 80(%rcx) -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps %xmm6, 112(%rcx) -; SSE-NEXT: movaps %xmm10, 128(%rcx) -; SSE-NEXT: movaps %xmm0, 144(%rcx) -; SSE-NEXT: movaps %xmm5, 160(%rcx) -; SSE-NEXT: movaps %xmm4, 176(%rcx) +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm6 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm4 +; SSE-NEXT: movapd 32(%rsi), %xmm7 +; SSE-NEXT: movapd 48(%rsi), %xmm8 +; SSE-NEXT: movapd (%rdx), %xmm9 +; SSE-NEXT: movapd 16(%rdx), %xmm10 +; SSE-NEXT: movapd 32(%rdx), %xmm11 +; SSE-NEXT: movapd 48(%rdx), %xmm12 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movapd %xmm3, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] +; SSE-NEXT: movapd %xmm6, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm12[0],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: movapd %xmm8, 176(%rcx) +; SSE-NEXT: movapd %xmm6, 160(%rcx) +; SSE-NEXT: movapd %xmm11, 144(%rcx) +; SSE-NEXT: movapd %xmm7, 128(%rcx) +; SSE-NEXT: movapd %xmm3, 112(%rcx) +; SSE-NEXT: movapd %xmm10, 96(%rcx) +; SSE-NEXT: movapd %xmm4, 80(%rcx) +; 
SSE-NEXT: movapd %xmm1, 64(%rcx) +; SSE-NEXT: movapd %xmm9, 48(%rcx) +; SSE-NEXT: movapd %xmm2, 32(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm5, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf8: @@ -239,23 +239,23 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm0[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) @@ -270,36 +270,36 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 
= ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm4[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -443,78 +443,78 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm6 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm5[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3],ymm14[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = 
ymm6[0,1],mem[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 352(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 224(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 256(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -522,78 +522,78 @@ ; ; AVX2-ONLY-LABEL: store_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm10[1],ymm7[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm12[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm6 = ymm12[0,1],ymm6[2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm6[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm9[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 
(%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -610,27 +610,26 @@ ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <5,u,14,6,u,15,7,u> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] -; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] -; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm9 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <5,u,14,6,u,15,7,u> +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm6 +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm9 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1234,12 +1233,12 @@ ; ; AVX512-LABEL: store_i64_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 @@ -1247,53 +1246,52 @@ ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,8,u,1,9,u,2,10> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <5,u,14,6,u,15,7,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; 
AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] -; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] -; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm11 -; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm11 -; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 -; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm10 -; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm10 -; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm2 -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm15 -; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,u,14,6,u,15,7,u> +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm19 +; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm19 +; AVX512-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm9 +; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm6 +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm6 +; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm12 +; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm12 +; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm7 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, 704(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 640(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 512(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512-NEXT: 
vmovdqa64 %zmm19, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -26,10 +26,10 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps %xmm5, 16(%r8) ; SSE-NEXT: movaps %xmm4, (%r8) ; SSE-NEXT: retq @@ -97,26 +97,26 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm5 ; SSE-NEXT: movaps (%rcx), %xmm6 ; SSE-NEXT: movaps 16(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm5, 112(%r8) -; SSE-NEXT: movaps %xmm6, 64(%r8) -; SSE-NEXT: movaps %xmm2, 80(%r8) -; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: movaps %xmm6, 80(%r8) +; SSE-NEXT: movaps %xmm2, 64(%r8) ; SSE-NEXT: movaps %xmm4, 48(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) -; SSE-NEXT: movaps %xmm8, 16(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm9, 16(%r8) +; SSE-NEXT: movaps %xmm8, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride4_vf4: @@ -131,8 +131,8 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) ; AVX1-ONLY-NEXT: vzeroupper @@ -212,125 +212,125 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 
48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm14 -; SSE-NEXT: movaps 32(%rsi), %xmm11 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm13 +; SSE-NEXT: movaps 32(%rsi), %xmm12 +; SSE-NEXT: movaps 48(%rsi), %xmm10 ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm5 ; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps 16(%rcx), %xmm13 +; SSE-NEXT: movaps 48(%rdx), %xmm0 +; SSE-NEXT: movaps (%rcx), %xmm11 +; SSE-NEXT: movaps 16(%rcx), %xmm14 ; SSE-NEXT: movaps 32(%rcx), %xmm15 -; SSE-NEXT: movaps 48(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] -; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm5, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: movaps 48(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm9, 240(%r8) -; SSE-NEXT: movaps %xmm6, 192(%r8) -; SSE-NEXT: movaps %xmm11, 208(%r8) -; SSE-NEXT: movaps %xmm3, 160(%r8) +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] +; SSE-NEXT: movaps 48(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, 240(%r8) +; SSE-NEXT: movaps %xmm6, 224(%r8) +; SSE-NEXT: movaps %xmm9, 208(%r8) +; SSE-NEXT: movaps %xmm15, 192(%r8) ; SSE-NEXT: movaps %xmm7, 176(%r8) -; SSE-NEXT: movaps %xmm15, 128(%r8) -; SSE-NEXT: movaps %xmm14, 
144(%r8) +; SSE-NEXT: movaps %xmm3, 160(%r8) +; SSE-NEXT: movaps %xmm12, 144(%r8) +; SSE-NEXT: movaps %xmm14, 128(%r8) +; SSE-NEXT: movaps %xmm5, 112(%r8) ; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps %xmm4, 112(%r8) -; SSE-NEXT: movaps %xmm13, 64(%r8) -; SSE-NEXT: movaps %xmm10, 80(%r8) -; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm13, 80(%r8) +; SSE-NEXT: movaps %xmm11, 64(%r8) ; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps %xmm8, (%r8) +; SSE-NEXT: movaps %xmm4, 32(%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride4_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 144(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 144(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 48(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride4_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm6 @@ -341,32 +341,32 @@ ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm10 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -655,13 +655,13 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] @@ -672,39 +672,39 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm15[0],xmm14[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), 
%xmm15 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 432(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 304(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 256(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm14, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm12, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 304(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 272(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 48(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -830,32 +830,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,1,9,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <2,10,u,u,3,11,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <4,12,u,u,5,13,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,12,u,u,5,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <6,14,u,u,7,15,u,u> ; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -870,14 +870,14 
@@ ; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -891,32 +891,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,1,9,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <2,10,u,u,3,11,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <4,12,u,u,5,13,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,12,u,u,5,13,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <6,14,u,u,7,15,u,u> ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -931,14 +931,14 @@ ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 
192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -1381,38 +1381,38 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1429,21 +1429,21 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 @@ -1454,31 +1454,31 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 944(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 928(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 912(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 816(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm7, 800(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm6, 784(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 768(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 944(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 928(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 896(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 688(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 672(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 656(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 688(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 672(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 656(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 640(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 560(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 544(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 528(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1488,29 +1488,29 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r8) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1772,32 +1772,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,10,u,u,3,11,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,12,u,u,5,13,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 
{%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <6,14,u,u,7,15,u,u> ; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -1848,22 +1848,22 @@ ; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm6 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1885,32 +1885,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,10,u,u,3,11,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,12,u,u,5,13,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} 
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <6,14,u,u,7,15,u,u> ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -1961,22 +1961,22 @@ ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -2867,26 +2867,26 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -2903,26 +2903,26 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -2939,50 +2939,50 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 -; 
AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -2999,76 +2999,76 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd 
{{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1584(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1568(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 1552(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1536(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1840(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 1824(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 1808(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 1792(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1968(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 1952(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 1936(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1968(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 1952(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 1936(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 1920(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 1840(%r8) +; AVX1-ONLY-NEXT: 
vmovaps %xmm7, 1824(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1808(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 1792(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 1712(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 1696(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1680(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 1664(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1584(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1568(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1552(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1920(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1536(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1712(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1696(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1440(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1680(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1424(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1664(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1408(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1328(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3078,14 +3078,6 @@ ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1280(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1440(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1424(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1408(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1200(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1184(%r8) @@ -3094,13 +3086,13 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1152(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1072(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1056(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1040(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1024(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 944(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3110,6 +3102,14 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 896(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r8) +; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 688(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 672(%r8) @@ -3118,21 +3118,21 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3142,21 +3142,21 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1072(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1056(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1040(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vmovaps %xmm0, 1024(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4263,8 +4263,6 @@ ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512-FAST: {{.*}} -; AVX512-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -29,9 +29,9 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm2, 16(%r9) -; SSE-NEXT: movapd %xmm0, 32(%r9) ; SSE-NEXT: movapd %xmm1, 48(%r9) +; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: movapd %xmm2, 16(%r9) ; SSE-NEXT: movapd %xmm3, 64(%r9) ; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: retq @@ -109,70 +109,70 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm4 -; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm9 -; SSE-NEXT: movaps 16(%r8), %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] -; SSE-NEXT: movaps %xmm4, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm3 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm4 +; SSE-NEXT: movapd (%rdx), %xmm1 +; SSE-NEXT: movapd 16(%rdx), %xmm5 +; SSE-NEXT: movapd (%rcx), %xmm6 +; SSE-NEXT: movapd 16(%rcx), %xmm7 +; SSE-NEXT: movapd (%r8), %xmm8 +; SSE-NEXT: movapd 16(%r8), %xmm9 +; SSE-NEXT: movapd %xmm0, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; SSE-NEXT: movapd %xmm3, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, (%r9) -; SSE-NEXT: movaps %xmm5, 16(%r9) -; SSE-NEXT: movaps %xmm9, 32(%r9) -; SSE-NEXT: movaps %xmm6, 48(%r9) -; SSE-NEXT: movaps %xmm7, 64(%r9) -; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps %xmm1, 96(%r9) -; SSE-NEXT: movaps %xmm3, 112(%r9) -; SSE-NEXT: 
movaps %xmm10, 128(%r9) -; SSE-NEXT: movaps %xmm8, 144(%r9) +; SSE-NEXT: movapd %xmm7, 144(%r9) +; SSE-NEXT: movapd %xmm4, 128(%r9) +; SSE-NEXT: movapd %xmm3, 112(%r9) +; SSE-NEXT: movapd %xmm5, 96(%r9) +; SSE-NEXT: movapd %xmm8, 80(%r9) +; SSE-NEXT: movapd %xmm6, 64(%r9) +; SSE-NEXT: movapd %xmm2, 48(%r9) +; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: movapd %xmm1, 16(%r9) +; SSE-NEXT: movapd %xmm10, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 96(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -182,34 +182,34 @@ ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -222,6 +222,7 @@ ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <15,3,7,u> ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,4,8,12,u,1,5,9> ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] @@ -232,8 +233,7 @@ ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512-NEXT: vmovdqa %ymm3, 128(%r9) ; 
AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64 @@ -332,132 +332,132 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm5[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm5[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0],ymm9[1,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm2[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 256(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 224(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 
(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm10[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%r9) ; AVX2-ONLY-NEXT: vzeroupper @@ -782,137 +782,135 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $216, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; 
AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm11[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm13[0],ymm11[1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm5[2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0],ymm8[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 480(%r9) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm9, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 320(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 576(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 512(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 576(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 544(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 512(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 256(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -921,18 +919,17 @@ ; AVX1-ONLY-NEXT: vmovaps 
%ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 608(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovapd %ymm5, 608(%r9) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $216, %rsp +; AVX1-ONLY-NEXT: addq $168, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1097,198 +1094,194 @@ ; ; AVX512F-LABEL: store_i64_stride5_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 {%k2} -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: 
vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm8 ; AVX512F-NEXT: movb $-116, %al -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 -; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm14 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 +; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512F-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k3} -; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqa64 
%zmm20, %zmm22 {%k2} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 576(%r9) +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm17, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm7, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k3} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; 
AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm8 ; AVX512BW-NEXT: movb $-116, %al -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 -; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1702,308 +1695,299 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1048, %rsp # imm = 0x418 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: subq $984, %rsp # imm = 0x3D8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 
= ymm2[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: 
vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 
160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm8[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[2],ymm6[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; 
AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, (%rsp), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm8[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} 
ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm5[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm2[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm11[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0],ymm1[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 
32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1120(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 816(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 656(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm9, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 816(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 800(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2011,22 +1995,28 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte 
Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r9) @@ -2045,20 +2035,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) @@ -2068,7 +2052,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $1048, %rsp # imm = 0x418 +; AVX1-ONLY-NEXT: addq $984, %rsp # imm = 0x3D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2433,405 +2417,393 @@ ; ; AVX512F-LABEL: store_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm22 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 
128(%rcx), %zmm25 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 +; AVX512F-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm20 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm10 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm22, %zmm0 ; 
AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm24 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm31, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm13, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm16, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm9, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm20, %zmm1, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm1, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm1, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm31, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm27, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 
-; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm27, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm31 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm15, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm31 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512F-NEXT: movb $-116, %al -; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-NEXT: movb $-116, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: 
vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-NEXT: movb $8, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm11 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm25 {%k3} ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm30 {%k3} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} -; 
AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm31 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm13 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%r9) +; AVX512F-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm22 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 +; AVX512BW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm22, 
%zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm21, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm24 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm9, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm21, 
%zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm31, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm10 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm15, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512BW-NEXT: movb $-116, %al -; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} -; AVX512BW-NEXT: movb $24, 
%al +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-NEXT: movb $-116, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: movb $8, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm25 {%k3} ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm25 +; AVX512BW-NEXT: 
vpermt2q %zmm0, %zmm14, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm30 {%k3} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm14 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm12 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm31 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 320(%r9) +; 
AVX512BW-NEXT: vmovdqa64 %zmm11, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r9) +; AVX512BW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -2865,8 +2837,8 @@ ; SSE-NEXT: movapd (%rcx), %xmm12 ; SSE-NEXT: movapd 16(%rcx), %xmm13 ; SSE-NEXT: movapd 32(%rcx), %xmm14 -; SSE-NEXT: movapd 16(%r8), %xmm1 ; SSE-NEXT: movapd 32(%r8), %xmm0 +; SSE-NEXT: movapd 16(%r8), %xmm1 ; SSE-NEXT: movapd (%r8), %xmm2 ; SSE-NEXT: movapd %xmm3, %xmm15 ; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm6[0] @@ -3661,163 +3633,117 @@ ; AVX1-ONLY-LABEL: store_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 288(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 464(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 
(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 
96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -3830,142 +3756,192 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm12 +; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} 
ymm0 = ymm9[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm8[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm12[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[3] +; 
AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm8[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm8[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm8[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7] @@ -3978,25 +3954,22 @@ ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = 
mem[0],ymm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm8[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] @@ -4012,163 +3985,149 @@ ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm7[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 
184(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm6[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r8), 
%ymm5 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm5[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm4[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded 
Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 368(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 448(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] @@ -4176,54 +4135,44 @@ ; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm2[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, 
%ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0],ymm1[1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4235,105 +4184,106 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; 
AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1936(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 1920(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2416(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 2400(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2256(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 2240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 2416(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 2400(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2096(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 2080(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2080(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 1936(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1920(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 1776(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1760(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 1616(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1600(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 1456(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1440(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1296(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 1280(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1616(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1600(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 1776(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 1760(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 1440(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1120(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 960(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 656(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: 
vmovaps %xmm0, 320(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1296(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1280(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2432(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2304(%r9) @@ -4346,6 +4296,8 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2112(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1952(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1856(%r9) @@ -4354,10 +4306,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1344(%r9) @@ -4370,10 +4326,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9) @@ -4382,6 +4342,8 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 
+; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) @@ -4400,8 +4362,6 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2528(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2368(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2336(%r9) @@ -4412,20 +4372,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1728(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%r9) @@ -4436,20 +4390,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -29,15 +29,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: 
movaps %xmm2, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, 16(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movaps %xmm4, 80(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm8, 80(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm2, 16(%rax) ; SSE-NEXT: movaps %xmm7, 64(%rax) ; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: retq @@ -126,94 +126,92 @@ ; SSE-LABEL: store_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm5 ; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps (%rdx), %xmm1 +; SSE-NEXT: movaps 16(%rdx), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps (%r8), %xmm4 ; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps (%r9), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm12 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) ; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) -; SSE-NEXT: movaps 
%xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm15, 64(%rax) -; SSE-NEXT: movaps %xmm14, 80(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm3, 144(%rax) +; SSE-NEXT: movaps %xmm7, 160(%rax) +; SSE-NEXT: movaps %xmm2, 144(%rax) +; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm8, 112(%rax) +; SSE-NEXT: movaps %xmm5, 96(%rax) +; SSE-NEXT: movaps %xmm4, 80(%rax) +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm14, 32(%rax) +; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm11 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 +; 
AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -226,39 +224,40 @@ ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = xmm5[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -420,178 +419,180 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm14 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm9 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,3],ymm1[4,5],ymm8[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm7, %ymm10 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm12[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm14[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0],ymm8[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[0],ymm14[0],ymm11[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[0],ymm15[0],ymm13[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm5[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 320(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm14, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride6_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm3[0,0] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm2[0,0] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm9[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[0,1],ymm2[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm5[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd 
{{.*#+}} xmm14 = xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm12 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm12 = ymm8[1],ymm13[1],ymm8[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 288(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -599,85 +600,85 @@ ; AVX512F-LABEL: store_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512F-NEXT: movb $12, %r10b ; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512F-NEXT: movb $16, %r10b ; AVX512F-NEXT: kmovw %r10d, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; 
AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512F-NEXT: movb $48, %r9b ; AVX512F-NEXT: kmovw %r9d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-NEXT: vinserti64x4 
$0, %ymm11, %zmm7, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm11, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [7,15,7,15] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-NEXT: vzeroupper @@ -686,85 +687,85 @@ ; AVX512BW-LABEL: store_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, 
%zmm8 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm11, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [7,15,7,15] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1030,18 +1031,18 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] @@ -1052,7 +1053,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 @@ -1062,7 +1063,7 @@ ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] @@ -1075,102 +1076,97 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 
= xmm1[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm12[1],ymm2[3],ymm12[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd 
{{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; 
AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm13 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm13[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] @@ -1180,54 +1176,57 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0] ; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 592(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 704(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 592(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 736(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 672(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 640(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 608(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 
32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1240,7 +1239,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1262,69 +1261,73 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm15[1],xmm3[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm10[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm13[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm10[0,1] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm0[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm2[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm15, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1333,11 +1336,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 @@ -1370,27 +1373,27 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm6 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] @@ -1402,13 +1405,13 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm11, 736(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 704(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 672(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 672(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 544(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 512(%rax) ; 
AVX2-ONLY-NEXT: vmovaps %ymm5, 480(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm14, 384(%rax) @@ -1442,1125 +1445,285 @@ ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: movb $12, %r10b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: movb $16, %r10b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: movb $48, %r9b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r9d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: 
vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf16: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: movb $12, %r10b -; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $16, %r10b -; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: movb $48, %r9b -; AVX512F-ONLY-FAST-NEXT: kmovw %r9d, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; 
AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # 
ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: movb $12, %r10b -; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: movb $16, %r10b -; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQ-SLOW-NEXT: movb $48, %r9b -; AVX512DQ-SLOW-NEXT: kmovw %r9d, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q 
%zmm10, %zmm20, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: store_i64_stride6_vf16: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: movb $12, %r10b -; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: movb $16, %r10b -; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQ-FAST-NEXT: movb $48, %r9b -; AVX512DQ-FAST-NEXT: kmovw %r9d, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQ-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm15, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: movb $12, %r10b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movb $16, %r10b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r9b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r9d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 
%zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 
$0, %ymm25, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf16: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: movb $12, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $16, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 
{{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: movb $48, %r9b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r9d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 
= [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), 
%zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: movb $12, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $16, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: movb $48, %r9b -; AVX512DQBW-SLOW-NEXT: kmovd %r9d, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; 
AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm5, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq +; AVX512F-LABEL: store_i64_stride6_vf16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: movb $12, %r10b +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} +; AVX512F-NEXT: movb $16, %r10b +; AVX512F-NEXT: kmovw %r10d, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm18, %zmm15 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [1,9,2,10,1,9,2,10] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm11 +; AVX512F-NEXT: movb $48, %r9b +; AVX512F-NEXT: kmovw %r9d, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,13,6,14,5,13,6,14] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm17, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,13,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm12 +; 
AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm22, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm22[1],mem[1],ymm22[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm21, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm22, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm24, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm9 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15] +; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm7 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm22, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 
256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf16: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: movb $12, %r10b -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: movb $16, %r10b -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: movb $48, %r9b -; AVX512DQBW-FAST-NEXT: kmovd %r9d, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQBW-FAST-NEXT: 
vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: store_i64_stride6_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: movb $12, %r10b +; AVX512BW-NEXT: kmovd %r10d, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} +; AVX512BW-NEXT: movb $16, %r10b +; AVX512BW-NEXT: kmovd %r10d, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [1,9,2,10,1,9,2,10] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm11 +; AVX512BW-NEXT: movb $48, %r9b +; AVX512BW-NEXT: kmovd %r9d, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: 
# zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,13,6,14,5,13,6,14] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm22[1],mem[1],ymm22[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm9 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm7 +; 
AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64 @@ -3090,13 +2253,11 @@ ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm14[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] @@ -3129,8 +2290,9 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] @@ -3168,64 +2330,65 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; AVX1-ONLY-NEXT: vunpckhpd 
{{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm4 -; 
AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm2[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm7 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] @@ -3236,8 +2399,8 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3251,153 +2414,137 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = 
ymm0[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm7 = ymm3[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm4[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm15 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -3409,19 +2556,19 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm9[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm0[0],mem[0] @@ -3435,13 +2582,13 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload @@ -3456,22 +2603,22 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 1168(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 1360(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 976(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 976(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 592(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 784(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3575,7 +2722,8 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[0,1],ymm3[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -3594,23 +2742,25 @@ ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), 
%ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -3625,14 +2775,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -3647,14 +2798,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -3668,12 +2820,13 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -3686,12 +2839,13 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -3704,12 +2858,13 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] @@ -7274,69 +6429,69 @@ ; AVX1-ONLY-LABEL: store_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: 
vbroadcastsd 40(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7344,20 +6499,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm1, 
%ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7365,21 +6520,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7387,20 +6541,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), 
%ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7408,20 +6562,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7429,20 +6583,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7453,18 +6607,18 @@ ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7475,18 +6629,18 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7497,18 +6651,18 @@ ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7519,18 +6673,18 @@ ; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7538,20 +6692,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7559,20 +6713,20 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7580,202 +6734,168 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] -; 
AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm2 +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm13[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm9[0],ymm6[0],ymm9[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = 
ymm4[0],ymm8[0],ymm4[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm11[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm14[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm10[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm8 = 
mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7787,7 +6907,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7799,7 +6919,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7811,7 +6931,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7823,7 +6943,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7835,7 +6955,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7847,7 +6967,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7859,7 +6979,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7910,7 +7030,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7922,7 +7042,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7934,7 +7054,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -7946,7 +7066,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -7961,7 +7081,7 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7969,7 +7089,7 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7979,7 +7099,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -7995,7 +7115,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -8061,54 +7181,54 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 2320(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 2304(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 2704(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 2896(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 2880(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 2512(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 2496(%rax) +; AVX1-ONLY-NEXT: vmovaps 
%xmm0, 2896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 2880(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 2704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 2688(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 2512(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 2496(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 2320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 2304(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 2128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2112(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 1936(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 1920(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 2128(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 2112(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 1744(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 1728(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 1744(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1728(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1552(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1536(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1168(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1152(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 960(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1552(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1536(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3008(%rax) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8291,7 +7411,8 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -8309,7 +7430,8 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 @@ -8317,17 +7439,18 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8342,14 +7465,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8364,14 +7488,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8386,14 +7511,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8408,14 +7534,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8430,14 +7557,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8452,14 +7580,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 256(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8474,14 +7603,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; 
AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 288(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8496,14 +7626,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 320(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8518,14 +7649,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8540,14 +7672,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8561,12 +7694,13 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8579,12 +7713,13 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8597,12 +7732,13 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -31,15 +31,15 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: movapd %xmm2, 16(%rax) -; SSE-NEXT: movapd %xmm4, 32(%rax) -; SSE-NEXT: movapd %xmm0, 48(%rax) -; SSE-NEXT: movapd %xmm3, 80(%rax) ; SSE-NEXT: movapd %xmm5, 96(%rax) +; SSE-NEXT: movapd %xmm3, 80(%rax) +; SSE-NEXT: movapd %xmm0, 48(%rax) +; SSE-NEXT: movapd %xmm4, 32(%rax) +; SSE-NEXT: movapd %xmm2, 16(%rax) ; SSE-NEXT: movapd %xmm1, 64(%rax) ; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: retq @@ -51,22 +51,22 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm7 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -107,17 +107,17 @@ ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <3,5,7,9,11,13,u,u> -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; 
AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -146,101 +146,100 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movapd (%rsi), %xmm0 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movapd (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movapd (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm1 -; SSE-NEXT: movapd (%r8), %xmm10 -; SSE-NEXT: movaps 16(%r8), %xmm6 +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm8 +; SSE-NEXT: movapd (%rdx), %xmm1 +; SSE-NEXT: movapd 16(%rdx), %xmm6 +; SSE-NEXT: movapd (%rcx), %xmm4 +; SSE-NEXT: movapd 16(%rcx), %xmm11 +; SSE-NEXT: movapd (%r8), %xmm3 +; SSE-NEXT: movapd 16(%r8), %xmm10 ; SSE-NEXT: movapd (%r9), %xmm9 -; SSE-NEXT: movaps 16(%r9), %xmm5 +; SSE-NEXT: movapd 16(%r9), %xmm12 +; SSE-NEXT: movapd 16(%r10), %xmm13 ; SSE-NEXT: movapd (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), %xmm12 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm12[1] -; SSE-NEXT: movapd %xmm8, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm10, 32(%rax) -; SSE-NEXT: movapd %xmm8, 48(%rax) +; SSE-NEXT: movapd %xmm5, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; SSE-NEXT: movapd %xmm12, 208(%rax) +; SSE-NEXT: movapd %xmm11, 192(%rax) +; SSE-NEXT: movapd %xmm8, 176(%rax) +; SSE-NEXT: movapd %xmm5, 160(%rax) +; SSE-NEXT: movapd %xmm10, 144(%rax) +; SSE-NEXT: movapd %xmm6, 128(%rax) +; SSE-NEXT: movapd %xmm14, 112(%rax) ; SSE-NEXT: movapd %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm7, 112(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) -; SSE-NEXT: movaps %xmm11, 176(%rax) -; SSE-NEXT: movapd %xmm15, (%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) -; SSE-NEXT: movapd %xmm0, 64(%rax) -; SSE-NEXT: movapd %xmm3, 80(%rax) -; SSE-NEXT: movaps %xmm2, 
128(%rax) -; SSE-NEXT: movaps %xmm6, 144(%rax) -; SSE-NEXT: movaps %xmm1, 192(%rax) -; SSE-NEXT: movaps %xmm5, 208(%rax) +; SSE-NEXT: movapd %xmm4, 80(%rax) +; SSE-NEXT: movapd %xmm2, 64(%rax) +; SSE-NEXT: movapd %xmm0, 48(%rax) +; SSE-NEXT: movapd %xmm3, 32(%rax) +; SSE-NEXT: movapd %xmm1, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm10[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = 
ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -248,51 +247,51 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 16(%r10), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -309,6 +308,7 @@ ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> ; AVX512F-NEXT: 
vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 @@ -328,16 +328,15 @@ ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,6,u,u,u,11,15,3> +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm0 ; AVX512F-NEXT: movb $28, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512F-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -354,6 +353,7 @@ ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 @@ -373,16 +373,15 @@ ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,6,u,u,u,11,15,3> +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: movb $28, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64 @@ -530,219 +529,212 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 
-; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm15[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm8[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm14[1],ymm3[3],ymm14[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm11[1] -; 
AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm14[0],xmm6[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), 
%xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm10[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 256(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: pushq %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm5 
-; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm13 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],mem[0],ymm12[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, 
%xmm13, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm12, %ymm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm14[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm11, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: 
vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-ONLY-NEXT: popq %rax ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -750,123 +742,123 @@ ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), 
%ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: movb $28, %cl +; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm13[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm4 ; 
AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: movb $-61, %cl +; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -874,246 +866,246 @@ ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} 
zmm0 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: movb $12, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: movb $112, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: movb $12, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 
{%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: movb $112, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: movb $48, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: movb $28, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] 
-; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: movb $28, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: movb $48, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k2} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm8, %ymm12, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: movb $14, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $120, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $-61, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf8: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $-61, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, 
%k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: movb $96, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: movb $28, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: movb $24, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm5, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $6, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd 
{{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: movb $56, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: movb $12, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: movb $112, %cl +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $56, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: movb $96, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-SLOW-NEXT: movb $28, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm12[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: movb $120, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: movb $48, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: movb $14, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: movb $120, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $-61, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: movb $24, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: movb $48, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: movb $14, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rcx) +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1121,122 +1113,123 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $-61, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: movb $48, %sil +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> -; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: movb $14, %sil +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm12 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm7, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: movb $96, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512DQ-FAST-NEXT: movb $28, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: movb $24, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $6, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $6, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: movb $56, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: movb $56, %cl +; AVX512DQ-FAST-NEXT: 
kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: movb $96, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-FAST-NEXT: movb $28, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: movb $12, %cl +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: movb $48, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: movb $112, %cl +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: movb $14, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-FAST-NEXT: movb $120, %cl -; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) 
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm14[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $-61, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: movb $24, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -1244,123 +1237,123 @@ ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; 
AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 
{%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm2[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm11, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm4, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, 
%zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: movb $28, %cl +; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm13[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %cl +; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -1368,246 +1361,246 @@ ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: movb $28, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = 
[11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: movb $28, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[2,3,2,3],zmm3[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: movb $48, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k2} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm8, %ymm12, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $-61, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf8: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $-61, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: movb $96, %sil +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQBW-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: movb $28, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: movb $24, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm5, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $6, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: movb $56, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; 
AVX512DQBW-SLOW-NEXT: movb $12, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512DQBW-SLOW-NEXT: movb $112, %cl +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $56, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: movb $96, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQBW-SLOW-NEXT: movb $28, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm12[2,3,2,3],zmm3[2,3,2,3] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: movb $120, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $48, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: movb $14, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm9, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm15 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: movb $120, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $-61, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: movb $24, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: movb $48, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: movb $14, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -1615,122 +1608,123 @@ ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm6 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $-61, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: movb $48, %sil +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,7,u> -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: movb $14, %sil +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm7, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: movb $96, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512DQBW-FAST-NEXT: movb $28, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm10[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: movb $24, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $6, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $6, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: movb $56, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $56, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm9 
= mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: movb $96, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQBW-FAST-NEXT: movb $28, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: movb $12, %cl +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $48, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: movb $112, %cl +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: movb $14, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 ; AVX512DQBW-FAST-NEXT: movb $120, %cl -; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm11 
= mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm14[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $-61, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: movb $24, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -2022,24 +2016,19 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm14 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm14[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 @@ -2051,16 +2040,16 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2068,178 +2057,165 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm11[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm7[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX1-ONLY-NEXT: 
vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3,4,5,6,7] 
-; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm10[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 
= xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm15, 464(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 832(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 832(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 864(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 736(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 864(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 800(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) @@ -2263,7 +2239,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2508,1685 +2484,1699 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: subq $200, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] 
-; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm20, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm21, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm21, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm18, %zmm8 +; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: movb $48, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm17[0],zmm18[0],zmm17[2],zmm18[2],zmm17[4],zmm18[4],zmm17[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm31, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm19, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm18, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: movb $24, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm30, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: movb $64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = 
[11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm29[0],ymm26[0],ymm29[2],ymm26[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k4} = zmm12[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm28, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm30, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm24[0],ymm2[2],ymm24[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k4} = zmm13[2,3,2,3],zmm30[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; 
AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm14, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm14[0],zmm7[0],zmm14[2],zmm7[2],zmm14[4],zmm7[4],zmm14[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm23, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm31[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm30, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k6} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm29[1],ymm26[1],ymm29[3],ymm26[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm24[1],ymm2[3],ymm24[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $200, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: pushq %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] ; 
AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm18 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 -; AVX512F-ONLY-FAST-NEXT: movb $96, %sil +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm31, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; 
AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 -; AVX512F-ONLY-FAST-NEXT: movb $120, %sil -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 -; AVX512F-ONLY-FAST-NEXT: movb $24, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $-61, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm19 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k2} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm17, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm21, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm25[0,1,2,3],zmm23[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] -; 
AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $28, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $6, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm30 +; AVX512F-ONLY-FAST-NEXT: movb $56, %sil +; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil +; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm10 {%k4} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm13, %zmm21, %zmm18 +; AVX512F-ONLY-FAST-NEXT: movb $96, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm31, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm12 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX512F-ONLY-FAST-NEXT: movb $28, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k4} = zmm27[2,3,2,3],zmm16[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm0[0],ymm27[0],ymm0[2],ymm27[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k4} = zmm29[2,3,2,3],zmm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm15, %ymm29, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm29, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: movb $14, %cl +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm1[0],zmm4[0],zmm1[2],zmm4[2],zmm1[4],zmm4[4],zmm1[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $56, %al +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $120, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 
{%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm31, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: popq %rax ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $64, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: movb $96, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 -; AVX512DQ-SLOW-NEXT: movb $24, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQ-SLOW-NEXT: movb $28, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} 
zmm27 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm28 +; AVX512DQ-SLOW-NEXT: movb $112, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm24, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $-61, %dil +; AVX512DQ-SLOW-NEXT: kmovw %edi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm26, %zmm22 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 -; AVX512DQ-SLOW-NEXT: movb $48, %sil +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm23, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm31 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm22, %zmm19 +; AVX512DQ-SLOW-NEXT: movb $24, %dil +; AVX512DQ-SLOW-NEXT: kmovw %edi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: movb $-31, %dil +; AVX512DQ-SLOW-NEXT: kmovw %edi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: movb $12, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} -; AVX512DQ-SLOW-NEXT: movb $112, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: movb $120, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm2 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} -; AVX512DQ-SLOW-NEXT: movb $56, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: movb $-31, %sil +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm25[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm25 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm0, %zmm4 {%k2} ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $-61, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = 
[9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm30 +; AVX512DQ-SLOW-NEXT: movb $56, %sil +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm27, %zmm4 {%k3} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 {%k3} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512DQ-SLOW-NEXT: movb $96, %sil +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm11 {%k2} ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm29, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 {%k3} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: movb $28, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k3} = zmm28[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm28[0],ymm12[0],ymm28[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k3} = zmm14[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm30 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm30, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm24, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k2} +; AVX512DQ-SLOW-NEXT: movb $120, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm24, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} +; AVX512DQ-SLOW-NEXT: movb $48, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k2} = zmm6[0],zmm8[0],zmm6[2],zmm8[2],zmm6[4],zmm8[4],zmm6[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k2} = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm5 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: movb $64, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm28[1],ymm12[1],ymm28[3],ymm12[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm22, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: pushq %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: 
vbroadcasti64x2 {{.*#+}} ymm26 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 -; AVX512DQ-FAST-NEXT: movb $112, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 -; AVX512DQ-FAST-NEXT: movb $96, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 -; AVX512DQ-FAST-NEXT: movb $120, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 -; AVX512DQ-FAST-NEXT: movb $24, %dil 
-; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512DQ-FAST-NEXT: movb $-31, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $-61, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm27 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm19 ; AVX512DQ-FAST-NEXT: movb $48, %sil -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm2[0],zmm12[0],zmm2[2],zmm12[2],zmm2[4],zmm12[4],zmm2[6],zmm12[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm29, %zmm18, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm21, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm24, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm29, %zmm17, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm29 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm29 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm2 +; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQ-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: movb $28, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $6, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 +; AVX512DQ-FAST-NEXT: movb $-31, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm30 +; AVX512DQ-FAST-NEXT: movb $112, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm30, %zmm27 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $6, %sil +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm30, %zmm28 +; AVX512DQ-FAST-NEXT: movb $56, %sil +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm31, %zmm6 {%k4} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm10 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm15 +; AVX512DQ-FAST-NEXT: movb $96, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm31 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm31[0],ymm0[2],ymm31[2] +; AVX512DQ-FAST-NEXT: movb $28, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k4} = zmm14[2,3,2,3],zmm17[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX512DQ-FAST-NEXT: 
vpermi2q %zmm3, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k4} = zmm28[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermt2q %ymm31, %ymm28, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm14, %ymm28, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: movb $14, %cl +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k2} ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: movb $56, %al +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: movb $120, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 
%zmm1 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: subq $200, %rsp +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: 
vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm16, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm20, %zmm28 +; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm21, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm18, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm17[0],zmm18[0],zmm17[2],zmm18[2],zmm17[4],zmm18[4],zmm17[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm31, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm17[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm19, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm18, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm17, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm30, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: movb $64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, 
%zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm29, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm24[0],ymm15[0],ymm24[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm23[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm29[0],ymm26[0],ymm29[2],ymm26[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k4} = zmm12[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm28, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm26[0],ymm15[0],ymm26[2],ymm15[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm23[2,3,2,3],zmm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm23, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm23, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm25, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm30, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm2[0],ymm24[0],ymm2[2],ymm24[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k4} = zmm13[2,3,2,3],zmm30[2,3,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm20, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm25, 
%zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm31, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm14, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm14[0],zmm7[0],zmm14[2],zmm7[2],zmm14[4],zmm7[4],zmm14[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm21, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm23, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm12 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm20, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 
64(%rax), %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm31[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm31 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm24, %ymm4 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = ymm24[1],mem[1],ymm24[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm29[1],ymm26[1],ymm29[3],ymm26[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm26[1],ymm15[1],ymm26[3],ymm15[3] -; 
AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm24[1],ymm2[3],ymm24[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $200, %rsp ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: pushq %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,0,1,0,8,0,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm18, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: 
vpunpcklqdq {{.*#+}} xmm16 = xmm16[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm16, %zmm0, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm13, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm24, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm31, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm10, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm23, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; 
AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $-61, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm20, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm24, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k2} = zmm2[0],zmm11[0],zmm2[2],zmm11[2],zmm2[4],zmm11[4],zmm2[6],zmm11[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm17, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm21, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm25[0,1,2,3],zmm23[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm17, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm12[0],zmm0[0],zmm12[2],zmm0[2],zmm12[4],zmm0[4],zmm12[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm25, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm24, %ymm8, %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm26 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm27, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $28, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm24[0],ymm8[2],ymm24[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $6, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; 
AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: movb $56, %sil +; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil +; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm21, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: movb $96, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm31, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm12 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX512BW-ONLY-FAST-NEXT: movb $28, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k4} = zmm27[2,3,2,3],zmm16[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm29 = ymm0[0],ymm27[0],ymm0[2],ymm27[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k4} = zmm29[2,3,2,3],zmm8[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm15, %ymm29, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm27, %ymm29, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = 
zmm1[0],zmm4[0],zmm1[2],zmm4[2],zmm1[4],zmm4[4],zmm1[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm14, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $56, %al +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movb $120, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm18, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm31, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm5, 
%zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 576(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: popq %rax ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm16, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $64, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 
64(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: movb $96, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm11, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm19 -; AVX512DQBW-SLOW-NEXT: movb $24, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm30, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm25, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm27, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
<13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm24, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQBW-SLOW-NEXT: movb $28, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k3} = zmm24[2,3,2,3],zmm28[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm0, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm24 = ymm26[0],ymm5[0],ymm26[2],ymm5[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm24[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm24, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm25, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm1, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm8, %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm30, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm30 -; AVX512DQBW-SLOW-NEXT: movb $48, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm4[0],zmm13[0],zmm4[2],zmm13[2],zmm4[4],zmm13[4],zmm4[6],zmm13[6] -; 
AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k5} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm28 ; AVX512DQBW-SLOW-NEXT: movb $112, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm21, %zmm8 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $120, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm24, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm19[0,1,2,3],zmm18[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $-61, %dil +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm18 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm26, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm23, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm31 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm22, %zmm19 +; AVX512DQBW-SLOW-NEXT: movb $24, %dil +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: movb $-31, %dil +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm9 {%k4} -; AVX512DQBW-SLOW-NEXT: movb $56, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k5} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm30 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: movb $-31, %sil +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm25[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm25 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm0, %zmm4 {%k2} ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm25[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $-61, %sil +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm28, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm30 +; AVX512DQBW-SLOW-NEXT: movb $56, %sil +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm27, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm10 {%k3} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512DQBW-SLOW-NEXT: movb $96, 
%sil +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm11 {%k2} ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm12 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 {%k6} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm23, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm23[1],mem[1],ymm23[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm21, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm27 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm12, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r9), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm29, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 {%k3} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm29[0],ymm9[0],ymm29[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: movb $28, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k3} = zmm28[2,3,2,3],zmm23[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm28[0],ymm12[0],ymm28[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k3} = zmm14[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm30, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm24, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k2} +; AVX512DQBW-SLOW-NEXT: movb $120, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm24, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 
%zmm30 {%k2} +; AVX512DQBW-SLOW-NEXT: movb $48, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k2} = zmm6[0],zmm8[0],zmm6[2],zmm8[2],zmm6[4],zmm8[4],zmm6[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k2} = zmm5[0],zmm7[0],zmm5[2],zmm7[2],zmm5[4],zmm7[4],zmm5[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm5 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm29[1],ymm9[1],ymm29[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: movb $64, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm26[1],ymm5[1],ymm26[3],ymm5[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm28[1],ymm12[1],ymm28[3],ymm12[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm22, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm17, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: pushq %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm21, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 
64(%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm12 -; AVX512DQBW-FAST-NEXT: movb $112, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm12, %zmm2 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm23, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm25 -; AVX512DQBW-FAST-NEXT: movb $96, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm19 -; AVX512DQBW-FAST-NEXT: movb $120, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm22[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm24, %zmm4 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm22, %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm22 -; AVX512DQBW-FAST-NEXT: movb $24, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm22 {%k2} -; AVX512DQBW-FAST-NEXT: movb $-31, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; 
AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm15, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm21 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $-61, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm21 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm27 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm20, %zmm19 ; AVX512DQBW-FAST-NEXT: movb $48, %sil -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm11, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm12 {%k3} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k1} = zmm2[0],zmm12[0],zmm2[2],zmm12[2],zmm2[4],zmm12[4],zmm2[6],zmm12[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm29, %zmm18, %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm21, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm24, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm26[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm6, %ymm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm16, %zmm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm13[0],zmm0[0],zmm13[2],zmm0[2],zmm13[4],zmm0[4],zmm13[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm29, %zmm17, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm29 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm29 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm2 +; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm23, %ymm8, %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm25 {%k3} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm18[0,1,2,3],zmm17[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: movb $28, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm6[2,3,2,3],zmm31[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm8[0],ymm23[0],ymm8[2],ymm23[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm6[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $6, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 +; AVX512DQBW-FAST-NEXT: movb $-31, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm30 +; AVX512DQBW-FAST-NEXT: movb $112, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm30, %zmm27 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $6, %sil +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm30, %zmm28 +; AVX512DQBW-FAST-NEXT: movb $56, %sil +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm31, %zmm6 {%k4} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm10 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm15 +; AVX512DQBW-FAST-NEXT: movb $96, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm31 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm31[0],ymm0[2],ymm31[2] +; AVX512DQBW-FAST-NEXT: movb $28, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k4} = zmm14[2,3,2,3],zmm17[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm14 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm28 +; 
AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm5, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k4} = zmm28[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm31, %ymm28, %ymm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm14, %ymm28, %ymm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: movb $14, %cl +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm7, %zmm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm23 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k2} ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: movb $56, %al +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: movb $120, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm5 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 832(%rax) +; AVX512DQBW-FAST-NEXT: popq %rax ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -4768,122 +4758,117 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1320, %rsp # imm = 0x528 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm8 +; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 
32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; 
AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 @@ -4910,72 +4895,68 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = 
xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 @@ -5002,72 +4983,68 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rax), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 @@ -5087,76 +5064,75 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
224(%r8), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0],ymm0[1],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm10[0],xmm13[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm11[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), 
%rsi ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 1360(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1344(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 912(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1360(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1344(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 912(%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm13, 896(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm14, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 1760(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 1728(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 1696(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1696(%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 1664(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 1632(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1600(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 1568(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 1568(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 1536(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rsi) @@ -5252,20 +5228,21 @@ ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 +; AVX2-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5280,14 +5257,14 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 
-; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm5 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5297,13 +5274,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 @@ -5425,24 +5402,21 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm13 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} 
ymm1 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5456,13 +5430,13 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm7 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -5476,9 +5450,9 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5490,8 +5464,8 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm0 @@ -5499,166 +5473,166 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded 
Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 240(%rax), %xmm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-ONLY-NEXT: vpermpd 
{{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%r9), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm7 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm8 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; 
AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm2, 1760(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 1728(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1760(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1728(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1664(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1664(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 1600(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1568(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1536(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1536(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5667,10 +5641,10 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 1376(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 1376(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1344(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 1312(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1312(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5681,8 +5655,8 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 1120(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 1088(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 1120(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 1088(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5691,10 +5665,10 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 928(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 928(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 896(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 864(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 864(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5705,7 +5679,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 672(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 672(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 
# 32-byte Reload @@ -5716,7 +5690,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) @@ -5748,7 +5722,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: addq $1672, %rsp # imm = 0x688 +; AVX2-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -6204,439 +6178,442 @@ ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512F-ONLY-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm5[2,3,2,3],zmm8[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 
{{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm3, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm2[0],ymm23[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm4, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm22[0],ymm1[0],ymm22[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), 
%zmm18 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm16[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 
-; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm5[2,3,2,3],zmm9[2,3,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm27, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm14[0],zmm6[2],zmm14[2],zmm6[4],zmm14[4],zmm6[6],zmm14[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm31, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm27, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm10[0],zmm29[0],zmm10[2],zmm29[2],zmm10[4],zmm29[4],zmm10[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm31, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm22, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm23, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm27, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), 
%xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm20, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k5} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k5} ; AVX512F-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm29 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm25 {%k4} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k4} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k4} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm20, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = 
zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm6 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $8, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm6 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 704(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1536(%rax) ; AVX512F-ONLY-FAST-NEXT: addq 
$2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -7086,247 +7063,246 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm11 ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; 
AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm5[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm4, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, 
%zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm23 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm2, %ymm4, %ymm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q 
%zmm22, %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm3[2,3,2,3],zmm11[2,3,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm24 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, 
%zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm7[0],zmm16[0],zmm7[2],zmm16[2],zmm7[4],zmm16[4],zmm7[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q 
%zmm16, %zmm4, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm23 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm31, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm10[0],zmm20[0],zmm10[2],zmm20[2],zmm10[4],zmm20[4],zmm10[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: 
vpermi2q %zmm0, %zmm18, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm10 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm26[0],zmm6[0],zmm26[2],zmm6[2],zmm26[4],zmm6[4],zmm26[6],zmm6[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm14 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil @@ -7341,53 +7317,53 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = zmm23[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm22[0,1,2,3],zmm29[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQ-FAST-NEXT: movb $-31, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 
%zmm27 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm26 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k4} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload @@ -7395,114 +7371,114 @@ ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm18 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} ; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm28 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm18 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm25 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm14 {%k4} ; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} ; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} ; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 
{%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512DQ-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm9 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,11,u,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm11[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: movb $8, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm19, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1472(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7511,12 +7487,12 @@ ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm28, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1536(%rax) ; AVX512DQ-FAST-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -7973,439 +7949,442 @@ ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%r9), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 {%k2} = zmm3[2,3,2,3],zmm4[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm3, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm5[2,3,2,3],zmm8[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm0, %zmm1, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm6[0],ymm23[2],ymm6[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm6, %ymm3, %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm3, %ymm4, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm24, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm2[0],ymm23[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm4, %ymm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm2, %ymm3, %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[2,3,2,3],zmm18[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm22[0],ymm1[0],ymm22[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm20, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm16[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm24, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm16, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm6[2,3,2,3],zmm11[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm2, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm14, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm5[2,3,2,3],zmm9[2,3,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm17, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm31, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm27, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm31, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm4, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm28, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q 
%zmm30, %zmm4, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm30[0],zmm15[0],zmm30[2],zmm15[2],zmm30[4],zmm15[4],zmm30[6],zmm15[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm15, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm28, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k3} = zmm6[0],zmm14[0],zmm6[2],zmm14[2],zmm6[4],zmm14[4],zmm6[6],zmm14[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm31, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm27, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm10[0],zmm29[0],zmm10[2],zmm29[2],zmm10[4],zmm29[4],zmm10[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm31, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm22, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm23, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm27, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm31, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm26[0],zmm3[0],zmm26[2],zmm3[2],zmm26[4],zmm3[4],zmm26[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm26, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm11, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k3} = zmm15[0],zmm0[0],zmm15[2],zmm0[2],zmm15[4],zmm0[4],zmm15[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm24 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm2, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm16 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 
128(%rax), %zmm3, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm31 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm30 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm20, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 
64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k5} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k5} ; AVX512BW-ONLY-FAST-NEXT: movb $-31, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm29 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm25 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm18 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm3 {%k4} ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm22, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm20, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,11,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm9[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,11,u,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,11,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $8, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm22, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,12,u,3,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm9, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm8, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <12,u,u,3,4,5,6,13> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
<0,12,u,3,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm6, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 704(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 640(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm27, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1536(%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -8855,247 +8834,246 @@ ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $2056, %rsp # imm = 0x808 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm11 ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 
%zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%r9), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm2[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm5[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm1, %ymm4, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-FAST-NEXT: # zmm1 
= mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm5[0],ymm23[2],ymm5[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm5, %ymm4, %ymm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm3[0],ymm22[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm3, %ymm4, %ymm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm22[0],ymm2[0],ymm22[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm2, %ymm4, %ymm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm19, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm0[2,3,2,3],zmm14[2,3,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm21, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[2,3,2,3],zmm13[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm18, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm8[2,3,2,3],zmm12[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm11 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k2} = zmm3[2,3,2,3],zmm11[2,3,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm11, %zmm1, %zmm24 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm28 ; AVX512DQBW-FAST-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm31 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k3} = zmm10[0],zmm16[0],zmm10[2],zmm16[2],zmm10[4],zmm16[4],zmm10[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k3} = zmm7[0],zmm16[0],zmm7[2],zmm16[2],zmm7[4],zmm16[4],zmm7[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm23 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm26, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm8, %zmm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm31, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = 
zmm10[0],zmm20[0],zmm10[2],zmm20[2],zmm10[4],zmm20[4],zmm10[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm15, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm31, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm5, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm13, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm4, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm25, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm10 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k3} = zmm27[0],zmm6[0],zmm27[2],zmm6[2],zmm27[4],zmm6[4],zmm27[6],zmm6[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm20[0],zmm0[0],zmm20[2],zmm0[2],zmm20[4],zmm0[4],zmm20[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm26, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm26[0],zmm6[0],zmm26[2],zmm6[2],zmm26[4],zmm6[4],zmm26[6],zmm6[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm19[0],zmm0[0],zmm19[2],zmm0[2],zmm19[4],zmm0[4],zmm19[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm19, %zmm0, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm19, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm19 ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil @@ -9110,53 +9088,53 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm2 = zmm24[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm2 = zmm23[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm15 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k3} ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k4} ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k3} -; 
AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm23[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm22[0,1,2,3],zmm29[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k5} -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm9[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k5} ; AVX512DQBW-FAST-NEXT: movb $-31, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm27 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm26 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm28 {%k4} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload @@ -9164,114 +9142,114 @@ ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm18 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm25 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm14 {%k4} ; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm29 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm6, %zmm28 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm19 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm3, %zmm18 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm26 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm3, %zmm25 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm6 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm11 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 192(%rax), %zmm0, %zmm14 {%k4} ; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm30 {%k4} ; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm17 {%k4} ; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm4 {%k4} ; AVX512DQBW-FAST-NEXT: vpbroadcastq 200(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm8 {%k4} ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm8 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm9 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm9 = zmm21[0,1,2,3],mem[4,5,6,7] -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm12[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm9 = zmm20[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,11,u,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm16, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm11[2,3,2,3],zmm0[2,3,2,3] ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: movb $8, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm19, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,12,u,3,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm10, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,12,3,4,5,6,7] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [14,1,2,3,4,5,6,15] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm10 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1472(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1280(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1152(%rax) -; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm15, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9280,12 +9258,12 @@ ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1728(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1664(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1536(%rax) ; AVX512DQBW-FAST-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq @@ -10442,7 +10420,7 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3816, %rsp # imm = 0xEE8 +; AVX1-ONLY-NEXT: subq $3448, %rsp # imm = 0xD78 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10456,9 +10434,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 @@ -10488,30 +10464,27 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = 
ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -10521,9 +10494,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 @@ -10533,9 +10504,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 @@ -10570,8 +10539,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -10579,12 +10547,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10603,9 +10569,7 @@ ; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm1 @@ -10615,9 +10579,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 @@ -10652,8 +10614,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -10661,12 +10622,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10685,9 +10644,7 @@ ; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm1 @@ -10697,9 +10654,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 @@ -10734,8 +10689,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -10743,12 +10697,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10767,9 +10719,7 @@ ; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm1 @@ -10779,9 +10729,7 @@ ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, 
%ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %ymm1 @@ -10816,21 +10764,18 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 288(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 288(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10849,9 +10794,7 @@ ; AVX1-ONLY-NEXT: vmovaps 304(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 304(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm1 @@ -10861,9 +10804,7 @@ ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %ymm1 @@ -10876,71 +10817,62 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX1-ONLY-NEXT: vmovaps 320(%rsi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 336(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%r9), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 336(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 352(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%r9), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 352(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 368(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%r9), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 352(%r9), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 368(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 368(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 384(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 @@ -10953,63 +10885,60 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 400(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 400(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm5[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: 
vinsertf128 $1, 416(%rcx), %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 
480(%rcx), %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdx), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11017,8 +10946,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 @@ -11033,24 +10961,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 @@ -11065,24 +10990,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 @@ -11097,24 +11019,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 @@ -11129,24 +11048,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} 
xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm0 @@ -11160,207 +11076,199 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm11[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 416(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%r8), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] -; 
AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0],ymm9[1],ymm7[2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 464(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0],ymm3[1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0],ymm15[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm8[1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm7, 3152(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 3136(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 2704(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 3152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 3136(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 2256(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 2240(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 2688(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 2240(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 1808(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1360(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 912(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 464(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 896(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 1808(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11569,27 +11477,27 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $3816, %rsp # imm = 0xEE8 +; AVX1-ONLY-NEXT: addq $3448, %rsp # imm = 0xD78 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf64: 
; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3880, %rsp # imm = 0xF28 +; AVX2-ONLY-NEXT: subq $3816, %rsp # imm = 0xEE8 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11603,13 +11511,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],mem[0],ymm2[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11620,7 +11528,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -11862,7 +11770,7 @@ ; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11994,18 +11902,20 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12024,9 +11934,9 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm5 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12040,10 +11950,10 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm3[0],ymm15[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12055,9 +11965,8 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 384(%rdx), %ymm11, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0 @@ -12065,24 +11974,21 @@ ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm14 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12096,13 +12002,13 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm9 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -12129,299 +12035,298 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rcx), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdx), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdx), %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm11 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: 
vperm2f128 {{.*#+}} ymm2 = ymm11[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm7 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 
480(%r8), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd (%rsp), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd %xmm10, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # 
ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%r9), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: 
vbroadcastsd 472(%r9), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm13 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%r9), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm11 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm12, 3552(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm12, 3520(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 3488(%rcx) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm8, 3552(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm8, 3520(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 3488(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3424(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 3360(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 3328(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 3392(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 3360(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 3328(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 3264(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 3232(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 3232(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3200(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3168(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 3168(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3136(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 3104(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 3104(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12432,8 +12337,8 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 2912(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 2880(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 2912(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 2880(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12442,10 +12347,10 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 2720(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 2720(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2688(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 2656(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 2656(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12456,8 +12361,9 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 2464(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 2432(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 2464(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload @@ -12466,7 +12372,8 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 2272(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2272(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2240(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -12609,7 +12516,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: addq $3880, %rsp # imm = 0xF28 +; AVX2-ONLY-NEXT: addq $3816, %rsp # imm = 0xEE8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -33,20 +33,20 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, 96(%rax) +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) ; SSE-NEXT: movaps %xmm2, 80(%rax) ; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps %xmm11, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) ; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: retq @@ -153,63 +153,63 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm14 +; SSE-NEXT: movaps (%rsi), %xmm8 +; SSE-NEXT: movaps 16(%rsi), %xmm11 ; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps 16(%rdx), %xmm3 -; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm10 ; SSE-NEXT: movaps 16(%rcx), %xmm12 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm0 -; SSE-NEXT: movaps (%r9), %xmm13 +; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps (%r9), %xmm14 +; SSE-NEXT: movaps 16(%r9), %xmm13 ; SSE-NEXT: movaps (%r10), %xmm6 -; SSE-NEXT: movaps 16(%r10), %xmm9 +; SSE-NEXT: movaps 16(%r10), %xmm0 ; SSE-NEXT: movaps (%rax), %xmm15 -; SSE-NEXT: movaps 16(%rax), %xmm11 +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] ; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = 
xmm8[0],xmm7[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] ; SSE-NEXT: movaps %xmm6, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] -; SSE-NEXT: movaps %xmm5, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: movaps 16(%r9), %xmm11 -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm12[1] +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps 16(%rax), %xmm13 +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps %xmm9, 240(%rax) -; SSE-NEXT: movaps %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm5, 96(%rax) -; SSE-NEXT: movaps %xmm6, 112(%rax) -; SSE-NEXT: movaps %xmm15, 32(%rax) -; SSE-NEXT: movaps %xmm14, 48(%rax) +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps %xmm7, 224(%rax) +; SSE-NEXT: movaps %xmm5, 208(%rax) ; SSE-NEXT: movaps %xmm2, 192(%rax) -; SSE-NEXT: movaps %xmm3, 208(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm10, 144(%rax) -; SSE-NEXT: movaps %xmm4, 64(%rax) +; SSE-NEXT: movaps %xmm9, 176(%rax) +; SSE-NEXT: movaps %xmm12, 160(%rax) +; SSE-NEXT: movaps %xmm11, 144(%rax) +; SSE-NEXT: movaps %xmm15, 128(%rax) +; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm3, 96(%rax) ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm4, 64(%rax) +; SSE-NEXT: movaps %xmm14, 48(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) +; SSE-NEXT: movaps %xmm8, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf4: @@ -219,52 +219,52 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r11), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} 
ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps (%r11), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 
(%r10), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -281,38 +281,38 @@ ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm5 ; AVX2-ONLY-NEXT: vmovaps (%r11), %ymm6 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r11), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r11), %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm11, %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: 
vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rax) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -536,245 +536,237 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm8[1],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm6[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm10[1],ymm1[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm9 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] 
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 288(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride8_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: pushq %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; 
AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm11 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm15 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm14[0],xmm13[0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm12, %ymm12 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: 
vmovaps (%r8), %ymm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm15 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm13 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = 
ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) -; AVX2-ONLY-NEXT: popq %rax +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -783,113 +775,119 @@ ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; 
AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%r11), %zmm1 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 ; AVX512F-NEXT: movb $-64, %r8b ; AVX512F-NEXT: kmovw %r8d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm17 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm8[0] +; AVX512F-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm12[0] +; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm14, %zmm11 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm11[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm10[1],xmm8[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm10, %zmm8 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; 
AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm10 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm11 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,13,5,13] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm11 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm12 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] ; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = 
zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [7,15,7,15] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX512F-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-NEXT: vmovdqa64 
%zmm5, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -898,113 +896,119 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm5, %ymm10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm5, %ymm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm17 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm11 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm8[0] +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm12[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm14, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm11[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm10[1],xmm8[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; 
AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm10 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,13,5,13] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm11 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [7,15,7,15] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm12 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] ; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = 
[5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm16 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm6 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [7,15,7,15] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm0 +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; 
AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -1888,460 +1892,1798 @@ ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride8_vf16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm31 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r11), %zmm30 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm29 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 -; AVX512F-NEXT: movb $-64, %r8b -; AVX512F-NEXT: kmovw %r8d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm23, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm21 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = 
zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm22 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm24 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm13, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm18, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm23, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm19 -; AVX512F-NEXT: 
vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] -; AVX512F-NEXT: vpermi2q %zmm25, %zmm11, %zmm13 -; AVX512F-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512F-NEXT: vpermt2q %zmm25, %zmm28, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm11 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm23 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = 
ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm10, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r8b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r8d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride8_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm31 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r11), %zmm30 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm29 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm18, %zmm4 -; AVX512BW-NEXT: movb $-64, %r8b -; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm23, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm31[1],zmm6[3],zmm31[3],zmm6[5],zmm31[5],zmm6[7],zmm31[7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm1 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm31[0],zmm6[2],zmm31[2],zmm6[4],zmm31[4],zmm6[6],zmm31[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm24 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm13, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm8[1],zmm9[1],zmm8[3],zmm9[3],zmm8[5],zmm9[5],zmm8[7],zmm9[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm20 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm28 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm8[0],zmm9[0],zmm8[2],zmm9[2],zmm8[4],zmm9[4],zmm8[6],zmm9[6] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [6,14,6,14] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm23 -; 
AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm16[1],zmm27[1],zmm16[3],zmm27[3],zmm16[5],zmm27[5],zmm16[7],zmm27[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm27[0],zmm16[2],zmm27[2],zmm16[4],zmm27[4],zmm16[6],zmm27[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm30[1],zmm29[1],zmm30[3],zmm29[3],zmm30[5],zmm29[5],zmm30[7],zmm29[7] -; AVX512BW-NEXT: vpermi2q %zmm25, %zmm11, %zmm13 -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm11 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm23 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm25[1],ymm15[1],ymm25[3],ymm15[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm25[0],ymm15[0],ymm25[2],ymm15[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm7, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm26[1],ymm23[1],ymm26[3],ymm23[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm10, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm26[0],ymm23[0],ymm26[2],ymm23[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq - %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 - %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64 - %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64 - %in.vec3 = load <16 x i64>, ptr %in.vecptr3, align 64 - %in.vec4 = load <16 x i64>, ptr %in.vecptr4, align 64 - %in.vec5 = load <16 x i64>, ptr %in.vecptr5, align 64 - %in.vec6 = load <16 x i64>, ptr %in.vecptr6, align 64 - %in.vec7 = load <16 x i64>, ptr %in.vecptr7, align 64 - %1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> - %2 = shufflevector <16 x i64> %in.vec2, <16 x i64> %in.vec3, <32 x i32> - %3 = shufflevector <16 x i64> %in.vec4, <16 x i64> %in.vec5, <32 x i32> - %4 = shufflevector <16 x i64> %in.vec6, <16 x i64> %in.vec7, <32 x i32> - %5 = shufflevector <32 x i64> %1, <32 x i64> %2, <64 x i32> - %6 = shufflevector <32 x i64> %3, <32 x i64> %4, <64 x i32> - %7 = shufflevector <64 x i64> %5, <64 x i64> %6, <128 x i32> - %interleaved.vec = shufflevector <128 x i64> %7, <128 x i64> poison, <128 x i32> - store <128 x i64> %interleaved.vec, ptr %out.vec, align 64 - ret void -} - -define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { -; SSE-LABEL: store_i64_stride8_vf32: -; SSE: # %bb.0: -; SSE-NEXT: subq $1688, %rsp # imm = 0x698 -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm1 -; SSE-NEXT: movaps 16(%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: 
movaps (%r8), %xmm11 -; SSE-NEXT: movaps 16(%r8), %xmm12 -; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps 16(%r9), %xmm4 -; SSE-NEXT: movaps (%r10), %xmm13 -; SSE-NEXT: movaps 16(%r10), %xmm14 -; SSE-NEXT: movaps (%rax), %xmm6 +; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf16: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-ONLY-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %r8b +; AVX512F-ONLY-FAST-NEXT: kmovw %r8d, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # 
ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQ-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $-64, %r8b +; AVX512DQ-SLOW-NEXT: kmovw %r8d, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, 
%zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm21 = 
mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; 
AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i64_stride8_vf16: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQ-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: movb $-64, %r8b +; AVX512DQ-FAST-NEXT: kmovw %r8d, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 
= ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; 
AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r8b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r8d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = 
mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf16: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r8b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r8d, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, 
%zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, 
%zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQBW-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $-64, %r8b +; AVX512DQBW-SLOW-NEXT: kmovd %r8d, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; 
AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, 
%zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; 
AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: 
retq +; +; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf16: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQBW-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $-64, %r8b +; AVX512DQBW-FAST-NEXT: kmovd %r8d, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, 
%ymm15 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 
%zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq + %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 + %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64 + %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64 + %in.vec3 = load <16 x i64>, ptr %in.vecptr3, align 64 + %in.vec4 = load <16 x i64>, ptr %in.vecptr4, align 64 + %in.vec5 = load <16 x i64>, ptr %in.vecptr5, align 64 + %in.vec6 = load <16 x i64>, ptr %in.vecptr6, align 64 + %in.vec7 = load <16 x i64>, ptr %in.vecptr7, align 64 + %1 = shufflevector <16 x i64> %in.vec0, <16 x i64> %in.vec1, <32 x i32> + %2 = shufflevector <16 x i64> %in.vec2, <16 x i64> %in.vec3, <32 x i32> + %3 = shufflevector <16 x i64> %in.vec4, <16 x i64> %in.vec5, <32 x i32> + %4 = shufflevector <16 x i64> %in.vec6, <16 x i64> %in.vec7, <32 x i32> + %5 = 
shufflevector <32 x i64> %1, <32 x i64> %2, <64 x i32> + %6 = shufflevector <32 x i64> %3, <32 x i64> %4, <64 x i32> + %7 = shufflevector <64 x i64> %5, <64 x i64> %6, <128 x i32> + %interleaved.vec = shufflevector <128 x i64> %7, <128 x i64> poison, <128 x i32> + store <128 x i64> %interleaved.vec, ptr %out.vec, align 64 + ret void +} + +define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { +; SSE-LABEL: store_i64_stride8_vf32: +; SSE: # %bb.0: +; SSE-NEXT: subq $1688, %rsp # imm = 0x698 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps 16(%rsi), %xmm0 +; SSE-NEXT: movaps (%rdx), %xmm9 +; SSE-NEXT: movaps 16(%rdx), %xmm10 +; SSE-NEXT: movaps (%rcx), %xmm3 +; SSE-NEXT: movaps 16(%rcx), %xmm2 +; SSE-NEXT: movaps (%r8), %xmm11 +; SSE-NEXT: movaps 16(%r8), %xmm12 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm4 +; SSE-NEXT: movaps (%r10), %xmm13 +; SSE-NEXT: movaps 16(%r10), %xmm14 +; SSE-NEXT: movaps (%rax), %xmm6 ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3040,84 +4382,84 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5 -; 
AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: 
vinsertf128 $1, 64(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -3158,22 +4500,22 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: 
vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4148,455 +5490,476 @@ ; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm25 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512F-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm21 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512F-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm17 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm24 ; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512F-NEXT: movb $-64, %r11b ; AVX512F-NEXT: kmovw %r11d, %k1 ; AVX512F-NEXT: vbroadcasti32x4 
{{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm21[0],zmm28[0],zmm21[2],zmm28[2],zmm21[4],zmm28[4],zmm21[6],zmm28[6] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] +; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13] +; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm14, %zmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm10 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] ; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-NEXT: 
vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = 
zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm18[1],zmm24[1],zmm18[3],zmm24[3],zmm18[5],zmm24[5],zmm18[7],zmm24[7] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,7,15] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm20[0],zmm23[0],zmm20[2],zmm23[2],zmm20[4],zmm23[4],zmm20[6],zmm23[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm2 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm20[1],zmm23[1],zmm20[3],zmm23[3],zmm20[5],zmm23[5],zmm20[7],zmm23[7] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm14, %zmm4 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm17[0],zmm22[0],zmm17[2],zmm22[2],zmm17[4],zmm22[4],zmm17[6],zmm22[6] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; 
AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm22[1],zmm17[3],zmm22[3],zmm17[5],zmm22[5],zmm17[7],zmm22[7] +; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, 
%zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm30 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm14 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512F-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm19 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm5, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm17 ; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 -; AVX512F-NEXT: vpermt2q 
%zmm18, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm18 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm5, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm7, %zmm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm16[0],zmm10[0],zmm16[2],zmm10[2],zmm16[4],zmm10[4],zmm16[6],zmm10[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm9[0],zmm26[0],zmm9[2],zmm26[2],zmm9[4],zmm26[4],zmm9[6],zmm26[6] +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm26[1],zmm9[3],zmm26[3],zmm9[5],zmm26[5],zmm9[7],zmm26[7] +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm30, %zmm4 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm30, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm27, %zmm30, %zmm7 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm26 = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %xmm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm19[0],xmm2[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm27, %zmm10 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 -; 
AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512F-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm19[1],xmm2[1] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm11[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm11[0],xmm10[0] +; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm2, %zmm2 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm11[1],xmm10[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm20[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX512F-NEXT: vmovdqa 128(%rsi), %xmm6 +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm6[0] +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm2 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm2[0,1,2,3],zmm23[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm31 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm6[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm31[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rcx), %xmm1 +; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm8[0],xmm6[0] +; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm0, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm8[1],xmm6[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vinserti32x4 $0, %xmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm19 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinsertf64x4 $0, %ymm4, %zmm0, %zmm4 ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rsi), %xmm7 -; AVX512F-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 -; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512F-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm26 ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512F-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm15 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vblendps $240, (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, 
%zmm24 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] -; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm17 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm16 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %ymm20 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %ymm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm23 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] 
+; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm13 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] -; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm10 +; AVX512F-NEXT: vmovdqa 128(%rdx), %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] +; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm17 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm17[0],ymm22[2],ymm17[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm16 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm22[1],ymm17[1],ymm22[3],ymm17[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm10 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; 
AVX512F-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 1152(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 704(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512F-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512F-NEXT: vmovaps %zmm19, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1088(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm18, 1984(%rax) +; AVX512F-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm26, 1856(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 1792(%rax) +; AVX512F-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512F-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 1344(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1280(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -4606,10 +5969,6 @@ ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) @@ -4617,7 +5976,14 @@ ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 1536(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm31, 1088(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rax) @@ -4633,455 +5999,476 @@ ; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 
128(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm23 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm21 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm28 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm24 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm22 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm21[0],zmm28[0],zmm21[2],zmm28[2],zmm21[4],zmm28[4],zmm21[6],zmm28[6] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,4,12] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm21[1],zmm28[1],zmm21[3],zmm28[3],zmm21[5],zmm28[5],zmm21[7],zmm28[7] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,13,5,13] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm10, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm14, %zmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm24[0],zmm18[2],zmm24[2],zmm18[4],zmm24[4],zmm18[6],zmm24[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm10 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] ; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm23[1],zmm18[1],zmm23[3],zmm18[3],zmm23[5],zmm18[5],zmm23[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm9 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,5,13] -; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm17[0],zmm24[0],zmm17[2],zmm24[2],zmm17[4],zmm24[4],zmm17[6],zmm24[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm10 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm9, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [7,15,7,15] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm5 -; AVX512BW-NEXT: 
vpunpcklqdq {{.*#+}} zmm6 = zmm19[0],zmm28[0],zmm19[2],zmm28[2],zmm19[4],zmm28[4],zmm19[6],zmm28[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm19[1],zmm28[1],zmm19[3],zmm28[3],zmm19[5],zmm28[5],zmm19[7],zmm28[7] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm4, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm22[0],zmm16[2],zmm22[2],zmm16[4],zmm22[4],zmm16[6],zmm22[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm16[1],zmm22[1],zmm16[3],zmm22[3],zmm16[5],zmm22[5],zmm16[7],zmm22[7] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm18[1],zmm24[1],zmm18[3],zmm24[3],zmm18[5],zmm24[5],zmm18[7],zmm24[7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,7,15] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm20[0],zmm23[0],zmm20[2],zmm23[2],zmm20[4],zmm23[4],zmm20[6],zmm23[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm7 +; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm2 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm20[1],zmm23[1],zmm20[3],zmm23[3],zmm20[5],zmm23[5],zmm20[7],zmm23[7] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm4 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm17[0],zmm22[0],zmm17[2],zmm22[2],zmm17[4],zmm22[4],zmm17[6],zmm22[6] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm3 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm17[1],zmm22[1],zmm17[3],zmm22[3],zmm17[5],zmm22[5],zmm17[7],zmm22[7] +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: 
vpermt2q %zmm1, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm30 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512BW-NEXT: 
vmovdqa64 192(%rax), %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm30 +; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 
%zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm5, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm17 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm29, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm18 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm16 -; AVX512BW-NEXT: 
vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm30[0],zmm6[0],zmm30[2],zmm6[2],zmm30[4],zmm6[4],zmm30[6],zmm6[6] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm7, %zmm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm16[0],zmm10[0],zmm16[2],zmm10[2],zmm16[4],zmm10[4],zmm16[6],zmm10[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm9[0],zmm26[0],zmm9[2],zmm26[2],zmm9[4],zmm26[4],zmm9[6],zmm26[6] +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm26[1],zmm9[3],zmm26[3],zmm9[5],zmm26[5],zmm9[7],zmm26[7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm30, %zmm4 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm30, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm27, %zmm30, %zmm7 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm27[0],zmm30[2],zmm27[2],zmm30[4],zmm27[4],zmm30[6],zmm27[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm26 = zmm30[1],zmm27[1],zmm30[3],zmm27[3],zmm30[5],zmm27[5],zmm30[7],zmm27[7] +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm19[0],xmm2[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm27 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm27, %zmm10 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm20 {%k1} = zmm30[1],zmm6[1],zmm30[3],zmm6[3],zmm30[5],zmm6[5],zmm30[7],zmm6[7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm21, 
%zmm17 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm21[0],zmm9[0],zmm21[2],zmm9[2],zmm21[4],zmm9[4],zmm21[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm21[1],zmm9[1],zmm21[3],zmm9[3],zmm21[5],zmm9[5],zmm21[7],zmm9[7] -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k1} = zmm14[0],zmm4[0],zmm14[2],zmm4[2],zmm14[4],zmm4[4],zmm14[6],zmm4[6] -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm14[1],zmm4[1],zmm14[3],zmm4[3],zmm14[5],zmm4[5],zmm14[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, (%rdx), %ymm3, %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm28 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm28, %zmm13, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 64(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdx), %ymm12, %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm0 -; AVX512BW-NEXT: vinserti128 $1, 128(%rcx), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512BW-NEXT: vinserti128 $1, 128(%rdx), %ymm12, %ymm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm17, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm19[1],xmm2[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm10 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm11[0],xmm10[0] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm2, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm11[1],xmm10[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm6 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm6[0] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm2[0,1,2,3],zmm23[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm31 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm10[1],xmm6[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm31[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rcx), %xmm1 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm6 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm8[0],xmm6[0] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm8[1],xmm6[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 = 
zmm1[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm28 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm26 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm19 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm5, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rsi), %xmm7 -; AVX512BW-NEXT: vinserti128 $1, 192(%rcx), %ymm7, %ymm7 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX512BW-NEXT: vinserti128 $1, 192(%rdx), %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm10[1],ymm7[1],ymm10[3],ymm7[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1 +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, 
%ymm7, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm26, %zmm26 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm6, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm15 +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm16[0],ymm6[0],ymm16[2],ymm6[2] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm17 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm24, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm24, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm16[1],ymm6[1],ymm16[3],ymm6[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm6 +; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm10[1],ymm8[1],ymm10[3],ymm8[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm16[0],ymm12[0],ymm16[2],ymm12[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm23[0],ymm20[0],ymm23[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm23 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm23[0],ymm17[0],ymm23[2],ymm17[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm16[1],ymm12[1],ymm16[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm23[1],ymm20[1],ymm23[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm17[1],ymm23[3],ymm17[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm13 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm16[0],ymm13[0],ymm16[2],ymm13[2] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm10 +; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm17 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm22 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm21 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm13[1],ymm16[3],ymm13[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 {%k1} -; 
AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm22[0],ymm17[0],ymm22[2],ymm17[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm16 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm22[1],ymm17[1],ymm22[3],ymm17[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm16[0],ymm20[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm20[1],ymm16[1],ymm20[3],ymm16[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm31, %zmm9 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm14[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm30, %zmm7 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 1728(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 1664(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1216(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 1152(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 1984(%rax) -; AVX512BW-NEXT: vmovaps %zmm10, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1472(%rax) -; AVX512BW-NEXT: vmovaps %zmm19, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1280(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1088(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 
1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm18, 1984(%rax) +; AVX512BW-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 1856(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 1792(%rax) +; AVX512BW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512BW-NEXT: vmovaps %zmm4, 1408(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 1344(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5091,18 +6478,21 @@ ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 1536(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 1088(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 1024(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rax) @@ -6550,58 +7940,59 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = 
xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; 
AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6609,19 +8000,18 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rdx), %ymm3 @@ -6629,7 +8019,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6662,86 +8052,86 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rdx), 
%ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r10), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r10), %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r10), %ymm5, %ymm5 +; 
AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 232(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6802,30 +8192,30 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 328(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 328(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 328(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6886,30 +8276,30 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 416(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 416(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 424(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -8878,618 +10268,635 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; 
AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 
-; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, 
%zmm18, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm26, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: 
vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, 
%zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q 
%zmm6, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm24 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 
%zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm12, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; 
AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -9508,976 +10915,1045 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = 
ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, 
%ymm0, %zmm17, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte 
Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, 
%ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm0, 3776(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; 
AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 
1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 128(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = 
zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, 
%zmm13, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 
%zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} 
= zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = 
zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, 
%zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10496,976 +11972,1045 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} 
ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: 
vmovdqa 256(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte 
Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; 
AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; 
AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps 
%zmm0, 2304(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQ-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 
64(%rcx), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512DQ-SLOW-NEXT: movb $-64, %r11b ; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; 
AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 
%zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = 
zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = 
zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: 
vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 320(%r8), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11484,976 +13029,1045 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte 
Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq 
{{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = 
ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 
$0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; 
AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 
$1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = 
xmm17[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte 
Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQ-SLOW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQ-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512DQ-FAST-NEXT: movb $-64, %r11b ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] 
-; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; 
AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 
%zmm25, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQ-FAST-NEXT: 
vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; 
AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm9 = 
mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, 
%zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; 
AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; 
AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -12472,976 +14086,1045 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} 
ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; 
AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; 
AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 
$0, %ymm1, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), 
%xmm25 -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQ-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 
1024(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps 
%zmm6, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 
384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, 
%zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 
%zmm25, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512BW-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: 
vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = 
zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, 
%zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = 
zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = 
zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; 
AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -13460,976 +15143,1045 @@ ; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), 
%ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: 
vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, 
%zmm21, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa (%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] 
+; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = 
xmm17[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), 
%xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
64(%rcx), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 
{{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; 
AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 128(%r10), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 
%zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = 
zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = 
zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 
-; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 
64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q 
%zmm30, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = 
zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -14448,976 +16200,1045 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; 
AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; 
AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; 
AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), 
%xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 
64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
320(%rdi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; 
AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 
# 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-SLOW-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b ; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm31, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; 
AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q 
%zmm9, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm8 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: 
vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = 
zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 
%zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = 
zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -15436,976 +17257,1045 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm3 
+; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %ymm23 -; 
AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = 
ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 
%zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, 
%xmm19, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 320(%rdi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; 
AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) 
+; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-FAST-NEXT: subq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm31 ; AVX512DQBW-FAST-NEXT: movb $-64, %r11b ; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm1[0],zmm28[0],zmm1[2],zmm28[2],zmm1[4],zmm28[4],zmm1[6],zmm28[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm15[0],zmm2[0],zmm15[2],zmm2[2],zmm15[4],zmm2[4],zmm15[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] ; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm1[1],zmm28[1],zmm1[3],zmm28[3],zmm1[5],zmm28[5],zmm1[7],zmm28[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,5,13,5,13,5,13] ; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm27[0],zmm21[2],zmm27[2],zmm21[4],zmm27[4],zmm21[6],zmm27[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm21[1],zmm27[1],zmm21[3],zmm27[3],zmm21[5],zmm27[5],zmm21[7],zmm27[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm18, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm30, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm15[1],zmm2[1],zmm15[3],zmm2[3],zmm15[5],zmm2[5],zmm15[7],zmm2[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm25[0],zmm26[0],zmm25[2],zmm26[2],zmm25[4],zmm26[4],zmm25[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm11 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm19[0],zmm31[0],zmm19[2],zmm31[2],zmm19[4],zmm31[4],zmm19[6],zmm31[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm25[1],zmm26[1],zmm25[3],zmm26[3],zmm25[5],zmm26[5],zmm25[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm19[1],zmm31[1],zmm19[3],zmm31[3],zmm19[5],zmm31[5],zmm19[7],zmm31[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm11 -; 
AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm10 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm14[0],zmm16[0],zmm14[2],zmm16[2],zmm14[4],zmm16[4],zmm14[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm30, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm23[0],zmm24[0],zmm23[2],zmm24[2],zmm23[4],zmm24[4],zmm23[6],zmm24[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm16[0],zmm17[0],zmm16[2],zmm17[2],zmm16[4],zmm17[4],zmm16[6],zmm17[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm7, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm24[1],zmm23[3],zmm24[3],zmm23[5],zmm24[5],zmm23[7],zmm24[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm17[1],zmm16[3],zmm17[3],zmm16[5],zmm17[5],zmm16[7],zmm17[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm14[0],zmm10[2],zmm14[2],zmm10[4],zmm14[4],zmm10[6],zmm14[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm14[1],zmm10[3],zmm14[3],zmm10[5],zmm14[5],zmm10[7],zmm14[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm29 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm23[0],zmm29[0],zmm23[2],zmm29[2],zmm23[4],zmm29[4],zmm23[6],zmm29[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm29[0],zmm20[0],zmm29[2],zmm20[2],zmm29[4],zmm20[4],zmm29[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm29[1],zmm20[1],zmm29[3],zmm20[3],zmm29[5],zmm20[5],zmm29[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm23[1],zmm29[1],zmm23[3],zmm29[3],zmm23[5],zmm29[5],zmm23[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm3[1],zmm0[1],zmm3[3],zmm0[3],zmm3[5],zmm0[5],zmm3[7],zmm0[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm26[0],zmm10[2],zmm26[2],zmm10[4],zmm26[4],zmm10[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm26[1],zmm10[3],zmm26[3],zmm10[5],zmm26[5],zmm10[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm0[0],zmm24[0],zmm0[2],zmm24[2],zmm0[4],zmm24[4],zmm0[6],zmm24[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm28[0],zmm9[2],zmm28[2],zmm9[4],zmm28[4],zmm9[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm28[1],zmm9[3],zmm28[3],zmm9[5],zmm28[5],zmm9[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm25[0],zmm2[0],zmm25[2],zmm2[2],zmm25[4],zmm2[4],zmm25[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm6 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm24[1],zmm17[3],zmm24[3],zmm17[5],zmm24[5],zmm17[7],zmm24[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm25[1],zmm3[1],zmm25[3],zmm3[3],zmm25[5],zmm3[5],zmm25[7],zmm3[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm10[0],zmm22[0],zmm10[2],zmm22[2],zmm10[4],zmm22[4],zmm10[6],zmm22[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm10[1],zmm22[1],zmm10[3],zmm22[3],zmm10[5],zmm22[5],zmm10[7],zmm22[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm15[0],zmm0[2],zmm15[2],zmm0[4],zmm15[4],zmm0[6],zmm15[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm15[1],zmm0[3],zmm15[3],zmm0[5],zmm15[5],zmm0[7],zmm15[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm26, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm15, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm30, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm17[0],zmm4[0],zmm17[2],zmm4[2],zmm17[4],zmm4[4],zmm17[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm17[1],zmm4[1],zmm17[3],zmm4[3],zmm17[5],zmm4[5],zmm17[7],zmm4[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm31[0],zmm12[0],zmm31[2],zmm12[2],zmm31[4],zmm12[4],zmm31[6],zmm12[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm31[1],zmm12[1],zmm31[3],zmm12[3],zmm31[5],zmm12[5],zmm31[7],zmm12[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm26, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm26[0],zmm24[0],zmm26[2],zmm24[2],zmm26[4],zmm24[4],zmm26[6],zmm24[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm24[1],zmm26[3],zmm24[3],zmm26[5],zmm24[5],zmm26[7],zmm24[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm19 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm13 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm18 -; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm29, %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm6, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm11[0],zmm20[0],zmm11[2],zmm20[2],zmm11[4],zmm20[4],zmm11[6],zmm20[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm11[1],zmm20[1],zmm11[3],zmm20[3],zmm11[5],zmm20[5],zmm11[7],zmm20[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm30[0],zmm0[0],zmm30[2],zmm0[2],zmm30[4],zmm0[4],zmm30[6],zmm0[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq 
{{.*#+}} zmm3 = zmm30[1],zmm0[1],zmm30[3],zmm0[3],zmm30[5],zmm0[5],zmm30[7],zmm0[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm5, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm16[0],zmm8[0],zmm16[2],zmm8[2],zmm16[4],zmm8[4],zmm16[6],zmm8[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload 
-; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm16[1],zmm8[1],zmm16[3],zmm8[3],zmm16[5],zmm8[5],zmm16[7],zmm8[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm19, %zmm6 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm19[0],zmm7[0],zmm19[2],zmm7[2],zmm19[4],zmm7[4],zmm19[6],zmm7[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm19[1],zmm7[1],zmm19[3],zmm7[3],zmm19[5],zmm7[5],zmm19[7],zmm7[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm16[0],zmm4[0],zmm16[2],zmm4[2],zmm16[4],zmm4[4],zmm16[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm16[1],zmm4[1],zmm16[3],zmm4[3],zmm16[5],zmm4[5],zmm16[7],zmm4[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm21[0],zmm3[0],zmm21[2],zmm3[2],zmm21[4],zmm3[4],zmm21[6],zmm3[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm3[1],zmm21[3],zmm3[3],zmm21[5],zmm3[5],zmm21[7],zmm3[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm18[0],zmm8[0],zmm18[2],zmm8[2],zmm18[4],zmm8[4],zmm18[6],zmm8[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm8[1],zmm18[3],zmm8[3],zmm18[5],zmm8[5],zmm18[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm5, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm23, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm23[0],zmm7[0],zmm23[2],zmm7[2],zmm23[4],zmm7[4],zmm23[6],zmm7[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm23[1],zmm7[1],zmm23[3],zmm7[3],zmm23[5],zmm7[5],zmm23[7],zmm7[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # 
ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -16424,359 +18314,411 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; 
AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm19, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 
$0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; 
AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm26, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm23[0],ymm3[0],ymm23[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm3[1],ymm23[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm18[0],ymm3[0],ymm18[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm20, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm21 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm3[1],ymm18[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm3[0],ymm10[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm6 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm3[1],ymm10[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, (%rdx), %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdx), %ymm4, %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm4[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm3[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; 
AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm7[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm7[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 128(%rdx), %ymm4, %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm17[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm22, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm17[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %xmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm30[0],xmm22[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm1[0,1,2,3],zmm2[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rcx), %ymm1, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdx), %ymm12, %ymm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm18, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm30[1],xmm22[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm17[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %xmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %xmm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm22[0],xmm19[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm0[0],xmm30[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm14[0,1,2,3],zmm24[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 256(%rcx), %ymm12, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm12 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 256(%rdx), %ymm12, %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 320(%rcx), %ymm13, %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 320(%rdx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm18[0],ymm13[0],ymm18[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm29, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm18[1],ymm13[1],ymm18[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rcx), %ymm18, %ymm18 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm25 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 384(%rdx), %ymm25, %ymm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm21, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm28, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rcx), %ymm18, %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm25 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, 448(%rdx), %ymm25, %ymm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm27 = ymm25[0],ymm18[0],ymm25[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm18 = ymm25[1],ymm18[1],ymm25[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm22[1],xmm19[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm24[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm19[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm19 = xmm24[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm13[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 3776(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 3712(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 2240(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 4032(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3968(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3904(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3840(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3584(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3520(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3456(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3392(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3328(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 3136(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2816(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 2112(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 3776(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3712(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 3264(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 3200(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 3648(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 3136(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 1024(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQBW-FAST-NEXT: addq $5512, %rsp # imm = 0x1588 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-FAST-NEXT: addq $5448, %rsp # imm = 0x1548 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -130,15 +130,15 @@ ; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm4, 16(%rdx) +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride2_vf32: @@ -216,25 +216,25 @@ ; SSE-NEXT: movdqa 32(%rsi), %xmm6 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm6, 112(%rdx) -; SSE-NEXT: movdqa 
%xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm5, 80(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm4, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm8, 16(%rdx) +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm3, 112(%rdx) +; SSE-NEXT: movdqa %xmm6, 96(%rdx) +; SSE-NEXT: movdqa %xmm2, 80(%rdx) +; SSE-NEXT: movdqa %xmm5, 64(%rdx) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm4, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride2_vf64: @@ -247,22 +247,22 @@ ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) ; 
AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride2_vf64: @@ -273,16 +273,16 @@ ; AVX2-ONLY-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX2-ONLY-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] ; AVX2-ONLY-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -324,22 +324,22 @@ ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -147,28 +147,28 @@ ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovq %xmm0, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx) +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,13,12,u,15,14,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,11,4,6,13,10,12,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[2,3],zero,xmm0[4,5],zero,xmm0[6,7],zero,xmm0[8,9],zero,xmm0[10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rcx) +; AVX1-ONLY-NEXT: vmovq %xmm3, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride3_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,21],zero,ymm0[30,22],zero,ymm0[31,23],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-ONLY-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-ONLY-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rcx) @@ -179,13 +179,12 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,21],zero,ymm0[30,22],zero,ymm0[31,23],zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vmovq %xmm1, 16(%rcx) ; AVX512F-NEXT: vmovdqa %xmm0, (%rcx) @@ -196,13 +195,12 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,21],zero,ymm0[30,22],zero,ymm0[31,23],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx) ; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx) @@ -340,12 +338,12 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] ; 
SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] @@ -355,107 +353,107 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: pand %xmm6, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,4,5,5,6] ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: por %xmm9, %xmm11 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: 
pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm6, %xmm12 ; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,6,6] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = 
xmm10[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm9, 32(%rcx) -; SSE-NEXT: movdqa %xmm7, 48(%rcx) -; SSE-NEXT: movdqa %xmm11, 80(%rcx) -; SSE-NEXT: movdqa %xmm6, 16(%rcx) -; SSE-NEXT: movdqa %xmm3, 64(%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, 80(%rcx) +; SSE-NEXT: movdqa %xmm11, 48(%rcx) +; SSE-NEXT: movdqa %xmm7, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, (%rcx) +; SSE-NEXT: movdqa %xmm5, 64(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf32: @@ -489,10 +487,10 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX1-ONLY-NEXT: retq @@ -590,252 +588,252 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm14 -; SSE-NEXT: movdqa 48(%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm14 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw 
{{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn 
%xmm3, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa 48(%rsi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: movdqa 48(%rdx), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] +; 
SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm7, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,6,6] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm12, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm13, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm14, %xmm13 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,6,5,7,7] ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: por %xmm14, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm9, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm15, 32(%rcx) -; SSE-NEXT: movdqa %xmm7, 48(%rcx) +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, 176(%rcx) +; SSE-NEXT: movdqa %xmm14, 144(%rcx) +; SSE-NEXT: movdqa %xmm7, 128(%rcx) +; SSE-NEXT: movdqa %xmm13, 96(%rcx) ; SSE-NEXT: movdqa %xmm12, 80(%rcx) -; SSE-NEXT: movdqa %xmm5, 96(%rcx) -; SSE-NEXT: movdqa %xmm4, 128(%rcx) -; SSE-NEXT: movdqa %xmm6, 144(%rcx) -; SSE-NEXT: movdqa %xmm8, 176(%rcx) -; SSE-NEXT: movdqa %xmm10, 16(%rcx) -; SSE-NEXT: movdqa %xmm13, 64(%rcx) +; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm6, 32(%rcx) +; SSE-NEXT: movdqa %xmm15, (%rcx) +; SSE-NEXT: movdqa %xmm8, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf64: @@ -940,18 +938,18 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 160(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 176(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm10, 144(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX1-ONLY-NEXT: addq $24, %rsp ; AVX1-ONLY-NEXT: retq ; @@ -1007,12 +1005,12 @@ ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1054,8 +1052,8 @@ ; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] ; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 160(%rcx) +; 
AVX512F-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 64(%rcx) ; AVX512F-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm5, 96(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -69,38 +69,53 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,0,3,1,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%r8) +; SSE-NEXT: movdqa %xmm0, (%r8) ; SSE-NEXT: retq ; -; AVX-LABEL: store_i8_stride4_vf4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] -; AVX-NEXT: vmovdqa %xmm0, (%r8) -; AVX-NEXT: retq +; AVX1-ONLY-LABEL: store_i8_stride4_vf4: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r8) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: store_i8_stride4_vf4: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,12,9,13,10,14,11,15,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i8_stride4_vf4: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpbroadcastd (%rdx), %xmm1 +; AVX512-NEXT: vpunpckldq (%rcx){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,12,9,13,10,14,11,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovdqa %xmm0, (%r8) +; AVX512-NEXT: retq %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 @@ -171,10 +186,10 @@ ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm1, 48(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm1, 32(%r8) ; SSE-NEXT: movdqa %xmm5, 16(%r8) ; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: retq @@ -193,8 +208,8 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vmovdqa %xmm4, 32(%r8) ; AVX1-NEXT: vmovdqa %xmm0, 48(%r8) +; AVX1-NEXT: vmovdqa %xmm4, 32(%r8) ; AVX1-NEXT: vmovdqa %xmm1, 16(%r8) ; AVX1-NEXT: vmovdqa %xmm3, (%r8) ; AVX1-NEXT: retq @@ -268,33 +283,33 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm4, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm6, 112(%r8) -; SSE-NEXT: movdqa %xmm8, 64(%r8) -; SSE-NEXT: movdqa %xmm10, 80(%r8) -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm5, 48(%r8) -; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, 112(%r8) +; SSE-NEXT: movdqa %xmm6, 96(%r8) +; SSE-NEXT: movdqa %xmm8, 80(%r8) +; SSE-NEXT: movdqa %xmm10, 64(%r8) +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movdqa %xmm2, 16(%r8) +; SSE-NEXT: movdqa %xmm3, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride4_vf32: @@ -327,10 +342,10 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, 
%ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -441,74 +456,74 @@ ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] ; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = 
xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] ; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] ; SSE-NEXT: movdqa 48(%rdx), %xmm15 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: movdqa %xmm2, 
224(%r8) -; SSE-NEXT: movdqa %xmm1, 240(%r8) -; SSE-NEXT: movdqa %xmm3, 192(%r8) -; SSE-NEXT: movdqa %xmm0, 208(%r8) -; SSE-NEXT: movdqa %xmm4, 160(%r8) -; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm13, 128(%r8) -; SSE-NEXT: movdqa %xmm14, 144(%r8) -; SSE-NEXT: movdqa %xmm11, 96(%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r8) -; SSE-NEXT: movdqa %xmm7, 64(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: movdqa %xmm1, 224(%r8) +; SSE-NEXT: movdqa %xmm3, 208(%r8) +; SSE-NEXT: movdqa %xmm0, 192(%r8) +; SSE-NEXT: movdqa %xmm4, 176(%r8) +; SSE-NEXT: movdqa %xmm9, 160(%r8) +; SSE-NEXT: movdqa %xmm13, 144(%r8) +; SSE-NEXT: movdqa %xmm14, 128(%r8) +; SSE-NEXT: movdqa %xmm11, 112(%r8) +; SSE-NEXT: movdqa %xmm8, 96(%r8) +; SSE-NEXT: movdqa %xmm7, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movdqa %xmm5, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movdqa %xmm6, (%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movdqa %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride4_vf64: @@ -573,14 +588,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -618,14 +633,14 @@ ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 128(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 128(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -663,14 +678,14 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX512F-NEXT: vmovdqa %ymm1, 224(%r8) -; AVX512F-NEXT: vmovdqa %ymm4, 64(%r8) +; AVX512F-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX512F-NEXT: vmovdqa %ymm0, 
96(%r8) -; AVX512F-NEXT: vmovdqa %ymm7, 128(%r8) +; AVX512F-NEXT: vmovdqa %ymm4, 64(%r8) ; AVX512F-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX512F-NEXT: vmovdqa %ymm2, (%r8) +; AVX512F-NEXT: vmovdqa %ymm7, 128(%r8) ; AVX512F-NEXT: vmovdqa %ymm3, 32(%r8) +; AVX512F-NEXT: vmovdqa %ymm2, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -690,19 +705,19 @@ ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm6 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm8 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm3, %ymm6 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm4 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) @@ -726,7 +741,6 @@ ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512: {{.*}} ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -20,8 +20,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -46,8 +46,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u] @@ -75,8 +75,8 @@ ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,1,2,1] @@ -119,28 +119,30 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,4,8,12],zero,xmm0[1,5,9,13],zero,xmm0[2,6,10,14],zero,xmm0[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,4,8,12],zero,xmm1[1,5,9,13],zero,xmm1[2,6,10,14],zero,xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovd %xmm0, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride5_vf4: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -154,11 +156,13 @@ ; ; AVX512F-LABEL: store_i8_stride5_vf4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -172,11 +176,13 @@ ; ; AVX512BW-LABEL: store_i8_stride5_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -288,128 +294,137 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,13,6,8,10,12,15,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = 
zero,zero,xmm1[0,8,u],zero,zero,xmm1[1,9,u],zero,zero,xmm1[2,10,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,13,6,8,10,12,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,8,u],zero,zero,xmm3[1,9,u],zero,zero,xmm3[2,10,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,1,9],zero,zero,xmm0[u,2,10],zero,zero,xmm0[u,3] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[3,11,u],zero,zero,xmm1[4,12,u],zero,zero,xmm1[5,13,u],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[3,11,u],zero,zero,xmm3[4,12,u],zero,zero,xmm3[5,13,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,xmm0[u,4,12],zero,zero,xmm0[u,5,13],zero,zero,xmm0[u,6,14] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3],zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,xmm2[5],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,zero,xmm1[4],zero,zero,zero,zero,xmm1[5],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovq %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovq %xmm2, 32(%r9) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: movq (%r8), %rax ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovq %rax, %xmm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] 
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: movq (%r8), %rax +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vmovq %rax, %xmm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,14,u],zero,zero,xmm4[7,15,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: shrq $48, %rax -; AVX2-SLOW-NEXT: vmovd %eax, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovd %eax, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %xmm5, %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3],zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm3, 32(%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: movq (%r8), %rax ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovq %rax, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: movq (%r8), %rax +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vmovq %rax, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,14,u],zero,zero,xmm4[7,15,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-FAST-NEXT: shrq $48, %rax -; AVX2-FAST-NEXT: vmovd %eax, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovd %eax, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %xmm5, %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3],zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm3, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: movq (%r8), %rax ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovq %rax, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: movq (%r8), %rax +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovq %rax, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,14,u],zero,zero,xmm4[7,15,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax -; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm5, %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3],zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -422,24 +437,25 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovq %rax, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovq %rax, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: shrq $48, %rax -; AVX512F-SLOW-NEXT: vmovd %eax, %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-SLOW-NEXT: vmovd %eax, %xmm3 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,8],zero,zero,ymm0[u,1,9],zero,zero,ymm0[u,2,10],zero,zero,ymm0[u,3],zero,ymm0[19,27,u],zero,zero,ymm0[20,28,u],zero,zero,ymm0[21,29,u],zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8,u],zero,zero,ymm0[1,9,u],zero,zero,ymm0[2,10,u],zero,ymm0[27],zero,zero,ymm0[u,20,28],zero,zero,ymm0[u,21,29],zero,zero,ymm0[u,22,30] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovq %xmm3, 32(%r9) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -453,24 +469,25 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq 
{{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovq %rax, %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovq %rax, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-FAST-NEXT: shrq $48, %rax -; AVX512F-FAST-NEXT: vmovd %eax, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-FAST-NEXT: vmovd %eax, %xmm3 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,8],zero,zero,ymm0[u,1,9],zero,zero,ymm0[u,2,10],zero,zero,ymm0[u,3],zero,ymm0[19,27,u],zero,zero,ymm0[20,28,u],zero,zero,ymm0[21,29,u],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8,u],zero,zero,ymm0[1,9,u],zero,zero,ymm0[2,10,u],zero,ymm0[27],zero,zero,ymm0[u,20,28],zero,zero,ymm0[u,21,29],zero,zero,ymm0[u,22,30] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovq %xmm3, 32(%r9) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -484,28 +501,29 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq %rax, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = 
zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512BW-SLOW-NEXT: movl $554189328, %ecx # imm = 0x21084210 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovq %rax, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-SLOW-NEXT: shrq $48, %rax -; AVX512BW-SLOW-NEXT: vpbroadcastw %eax, %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw %eax, %xmm3 ; AVX512BW-SLOW-NEXT: movw $132, %ax ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm3, %xmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,zero,ymm0[3],zero,ymm0[19,27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512BW-SLOW-NEXT: movl $554189328, %eax # imm = 0x21084210 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovq %xmm2, 32(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -518,28 +536,29 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512BW-FAST-NEXT: vmovq %rax, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; 
AVX512BW-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512BW-FAST-NEXT: movl $554189328, %ecx # imm = 0x21084210 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovq %rax, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FAST-NEXT: shrq $48, %rax -; AVX512BW-FAST-NEXT: vpbroadcastw %eax, %xmm1 +; AVX512BW-FAST-NEXT: vpbroadcastw %eax, %xmm3 ; AVX512BW-FAST-NEXT: movw $132, %ax ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX512BW-FAST-NEXT: vmovdqu8 %xmm3, %xmm2 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,zero,ymm0[3],zero,ymm0[19,27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1] +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512BW-FAST-NEXT: movl $554189328, %eax # imm = 0x21084210 +; AVX512BW-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovq %xmm2, 32(%r9) +; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -560,211 +579,208 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm5 ; SSE-NEXT: movdqa (%rcx), %xmm4 -; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; 
SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = 
xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm13, %xmm7 +; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm14 ; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; SSE-NEXT: pand %xmm10, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm14, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm15, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3] -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm13, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm14, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,0,0] 
+; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3] +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,3,2] +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, 64(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) -; SSE-NEXT: movdqa %xmm15, 16(%r9) -; SSE-NEXT: movdqa %xmm11, 48(%r9) +; SSE-NEXT: movdqa %xmm8, (%r9) +; SSE-NEXT: movdqa %xmm13, 48(%r9) +; SSE-NEXT: movdqa %xmm7, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm2[6,u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero,xmm4[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255> ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,xmm3[9,8],zero,zero,zero,xmm3[11,10],zero,zero,zero,xmm3[13,12] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%r9) -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) ; AVX1-ONLY-NEXT: retq ; @@ -902,13 +918,13 @@ ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,7],zero,ymm6[u,u,u,8],zero,ymm6[u,u,u,9],zero,ymm6[u,u,u],zero,ymm6[26,u,u,u],zero,ymm6[27,u,u,u],zero,ymm6[28,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[7,u,u,u],zero,ymm8[8,u,u,u],zero,ymm8[9,u,u,u,26],zero,ymm8[u,u,u,27],zero,ymm8[u,u,u,28],zero,ymm8[u,u] @@ -926,16 +942,16 @@ ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm6, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; 
AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] -; AVX512F-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15] -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] +; AVX512F-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper @@ -943,13 +959,13 @@ ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] @@ -963,18 +979,18 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512F-FAST-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512F-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[10,11,u],zero,zero,xmm2[12,13,u],zero,zero,xmm2[14,15,u] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] +; AVX512F-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper @@ -1042,7 +1058,7 @@ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero ; AVX512BW-FAST-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 @@ -1076,500 +1092,521 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: subq $120, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm13 ; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: 
movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa 16(%rcx), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,0] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa 16(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = 
xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,1,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; 
SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm11, %xmm14 ; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; 
SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm14, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm15[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm15[1,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 
= xmm13[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,1,3] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[0,0,0,0] +; SSE-NEXT: pand %xmm5, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm13, %xmm4 ; SSE-NEXT: pand %xmm11, %xmm14 -; SSE-NEXT: por %xmm14, %xmm15 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm14 -; SSE-NEXT: pand %xmm8, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: por %xmm14, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,3,2] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm10 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,3,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[3,3,3,3] +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm9[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: pshuflw $225, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,1] +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2] -; SSE-NEXT: pandn %xmm12, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: pandn %xmm2, %xmm11 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: pshuflw $164, (%rsp), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%r9) -; SSE-NEXT: movdqa %xmm11, 64(%r9) -; SSE-NEXT: movdqa %xmm0, 80(%r9) -; SSE-NEXT: movdqa %xmm15, 144(%r9) -; SSE-NEXT: movdqa %xmm7, 16(%r9) +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm0 +; 
SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, 144(%r9) +; SSE-NEXT: movdqa %xmm2, 80(%r9) +; SSE-NEXT: movdqa %xmm10, 64(%r9) +; SSE-NEXT: movdqa %xmm4, (%r9) +; SSE-NEXT: movdqa %xmm3, 128(%r9) +; SSE-NEXT: movdqa %xmm8, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%r9) +; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r9) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: addq $120, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: subq $40, %rsp +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm11[3],zero,zero,zero,zero,xmm11[4],zero,zero,zero,zero,xmm11[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = 
[0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = 
zero,xmm3[1,2,3,4],zero,xmm3[6,7,8,9],zero,xmm3[11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[9],zero,zero,zero,zero,xmm15[10],zero,zero,zero,zero,xmm15[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm13, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm14, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3],zero,xmm14[5,6,7,8],zero,xmm14[10,11,12,13],zero,xmm14[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm15[0],zero,zero,zero,zero,xmm15[1],zero,zero,zero,zero,xmm15[2],zero -; AVX1-ONLY-NEXT: vpor 
%xmm9, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[1,2,3,4],zero,xmm5[6,7,8,9],zero,xmm5[11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm6 = xmm15[u,u,u,7],zero,xmm15[u,u,u,8],zero,xmm15[u,u,u,9],zero,xmm15[u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[0,1,2,3],zero,xmm9[5,6,7,8],zero,xmm9[10,11,12,13],zero,xmm9[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm7[6,u,u,u],zero,xmm7[7,u,u,u],zero,xmm7[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm8[3],zero,zero,zero,zero,xmm8[4],zero,zero,zero,zero,xmm8[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: addq $40, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1663,8 +1700,8 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1688,10 +1725,11 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero ; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 @@ -1703,40 +1741,40 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,5,5,5,5,4,6] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 @@ -1753,8 +1791,8 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 
%ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -1798,37 +1836,37 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm3 @@ -1845,8 +1883,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1854,166 +1892,154 @@ ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero ; AVX512F-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero -; AVX512F-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm6, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9,u,11,u],zero,xmm8[10],zero,xmm8[12,u],zero +; AVX512F-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm8, %ymm7, %ymm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm8, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u] -; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u,u,19] -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm3[21],zero,ymm3[21,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm1[21,u],zero,ymm1[20],zero,ymm1[22],zero,ymm1[24,u],zero,ymm1[23],zero,ymm1[25,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm1[13,u,u],zero,zero,ymm1[14,u,u],zero,zero,ymm1[15,u,u],zero,zero,ymm1[16,u,u],zero,zero,ymm1[17,u,u],zero,zero,ymm1[18,u,u],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[12,13],zero,ymm2[u,u,13,14],zero,ymm2[u,u,14,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,16,17],zero,ymm2[u,u,17,18],zero,ymm2[u,u,18,19] +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm4[21],zero,ymm4[21,20],zero,ymm4[22],zero,ymm4[24],zero,ymm4[22,23],zero,ymm4[25] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,13],zero,ymm4[u,u,u,14],zero,ymm4[u,u,u,15],zero,ymm4[u,u,u,16],zero,ymm4[u,u,u,17],zero,ymm4[u,u,u,18],zero,ymm4[u,u] +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero ; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27,u],zero,ymm3[26],zero,ymm3[28],zero,ymm3[30,u],zero,ymm3[29],zero,ymm3[31,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm1[26],zero,ymm1[28,u],zero,ymm1[u],zero,ymm1[29],zero,ymm1[31,u],zero,ymm1[30] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero -; AVX512F-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero -; AVX512F-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; 
AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm7, %ymm6, %ymm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero +; AVX512F-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9,u,11,u],zero,xmm8[10],zero,xmm8[12,u],zero +; AVX512F-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u] -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 -; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u,u],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u,u,19] -; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm2[21],zero,ymm2[21,20],zero,ymm2[22],zero,ymm2[24],zero,ymm2[22,23],zero,ymm2[25] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm1[21,u],zero,ymm1[20],zero,ymm1[22],zero,ymm1[24,u],zero,ymm1[23],zero,ymm1[25,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-FAST-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm1[13,u,u],zero,zero,ymm1[14,u,u],zero,zero,ymm1[15,u,u],zero,zero,ymm1[16,u,u],zero,zero,ymm1[17,u,u],zero,zero,ymm1[18,u,u],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm2[12,13],zero,ymm2[u,u,13,14],zero,ymm2[u,u,14,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,16,17],zero,ymm2[u,u,17,18],zero,ymm2[u,u,18,19] +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm4[21],zero,ymm4[21,20],zero,ymm4[22],zero,ymm4[24],zero,ymm4[22,23],zero,ymm4[25] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,13],zero,ymm4[u,u,u,14],zero,ymm4[u,u,u,15],zero,ymm4[u,u,u,16],zero,ymm4[u,u,u,17],zero,ymm4[u,u,u,18],zero,ymm4[u,u] +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <4,u,5,5,5,5,u,6> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm8, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27,u],zero,ymm3[26],zero,ymm3[28],zero,ymm3[30,u],zero,ymm3[29],zero,ymm3[31,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero,zero +; AVX512F-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm10, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm1[26],zero,ymm1[28,u],zero,ymm1[u],zero,ymm1[29],zero,ymm1[31,u],zero,ymm1[30] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <6,6,6,u,7,7,7,7> -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <6,6,6,u,7,7,7,7> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -2137,40 +2163,42 @@ ; AVX512BW-FAST-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero ; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> -; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,3,3,u,4,4,4,4> +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm7 ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> +; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm7, %zmm7 ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero @@ 
-2190,7 +2218,7 @@ ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -2905,457 +2933,450 @@ ; AVX1-ONLY-LABEL: store_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,xmm6[3,4,5,6],zero,xmm6[8,9,10,11],zero,xmm6[13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm7[6],zero,zero,zero,zero,xmm7[7],zero,zero,zero,zero,xmm7[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = 
[255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[0,1,2],zero,xmm3[4,5,6,7],zero,xmm3[9,10,11,12],zero,xmm3[14,15] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm3[3],zero,zero,zero,zero,xmm3[4],zero,zero,zero,zero,xmm3[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm7[3],zero,zero,zero,zero,xmm7[4],zero,zero,zero,zero,xmm7[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3],zero,xmm5[5,6,7,8],zero,xmm5[10,11,12,13],zero,xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm3[0],zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,xmm3[2],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero,xmm4[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,7],zero,xmm0[u,u,u,8],zero,xmm0[u,u,u,9],zero,xmm0[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; 
AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[0,1],zero,xmm14[3,4,5,6],zero,xmm14[8,9,10,11],zero,xmm14[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm6[6],zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,xmm6[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm3[9],zero,zero,zero,zero,xmm3[10],zero,zero,zero,zero,xmm3[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; 
AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3],zero,xmm0[5,6,7,8],zero,xmm0[10,11,12,13],zero,xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u],zero,xmm6[7,u,u,u],zero,xmm6[8,u,u,u],zero,xmm6[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9],zero,xmm5[u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm8 +; 
AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3],zero,zero,zero,zero,xmm6[4],zero,zero,zero,zero,xmm6[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm15 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3],zero,xmm2[5,6,7,8],zero,xmm2[10,11,12,13],zero,xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm3[0],zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,xmm3[2],zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: 
vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5],zero,xmm3[7,8,9,10],zero,xmm3[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm11 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,1,2,3],zero,xmm0[5,6,7,8],zero,xmm0[10,11,12,13],zero,xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm12 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps 
%ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3],zero,xmm13[5,6,7,8],zero,xmm13[10,11,12,13],zero,xmm13[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[12],zero,zero,zero,zero,xmm13[13],zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,xmm13[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[6],zero,xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u],zero,xmm1[7,u,u,u],zero,xmm1[8,u,u,u],zero,xmm1[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,xmm10[3,4,5,6],zero,xmm10[8,9,10,11],zero,xmm10[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero +; AVX1-ONLY-NEXT: 
vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2],zero,xmm6[4,5,6,7],zero,xmm6[9,10,11,12],zero,xmm6[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, 
%ymm4, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero,xmm13[2],zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm13[9],zero,zero,zero,zero,xmm13[10],zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5],zero,xmm1[7,8,9,10],zero,xmm1[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 304(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 256(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 272(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 224(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: addq $104, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $312, %rsp # imm = 0x138 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX2-SLOW-NEXT: subq $248, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm8 -; AVX2-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, 
%ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,5,128,3,4,128,6,128,8,128,6,7,128,9,128,7,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm4[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm11, %ymm13, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = 
<255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,2,1,1,4,6,5,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm9, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,2,1,1,4,6,5,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm11 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; 
AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm12 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm12, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3363,63 +3384,61 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm5 ; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm3 ; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = 
mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> @@ -3427,7 +3446,7 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 @@ -3435,35 +3454,35 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 256(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) +; AVX2-SLOW-NEXT: addq $248, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $168, %rsp -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: 
vmovdqa (%rdx), %xmm3 @@ -3476,145 +3495,149 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,5,128,3,4,128,6,128,8,128,6,7,128,9,128,7,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm9, %ymm15, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = 
[6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,11,128,13,10,128,12,128,14,128,12,13,128,15,128,13,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [9,128,11,128,128,10,128,12,128,14,128,128,13,128,15,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,10,128,12,128,10,11,12,13,128,15,128,13,14,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; 
AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm15, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [11,128,128,10,128,12,128,128,128,128,13,128,15,128,128,14,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm14, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm12, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,5,5,5,5,4,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, 
%ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -3641,49 +3664,53 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; 
AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 288(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $168, %rsp +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = 
<8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 @@ -3694,149 +3721,145 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,5,128,3,4,128,6,128,8,128,6,7,128,9,128,7,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [128,11,128,13,10,128,12,128,14,128,12,13,128,15,128,13,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, 
%ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [9,128,11,128,128,10,128,12,128,14,128,128,13,128,15,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm15, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,13,10,128,12,128,10,11,12,13,128,15,128,13,14,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [11,128,128,10,128,12,128,128,128,128,13,128,15,128,128,14,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm4 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 @@ -3866,895 +3889,690 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm26 -; AVX512F-SLOW-NEXT: vporq %xmm4, %xmm12, %xmm31 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm22 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,ymm11[26],zero,ymm11[28],zero,ymm11[30],zero,zero,ymm11[29],zero,ymm11[31],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[19],zero,ymm9[21],zero,zero,ymm9[20],zero,ymm9[22],zero,ymm9[24],zero,zero,ymm9[23],zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vporq %xmm2, %xmm3, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm15, %xmm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = 
[255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm25, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vporq %ymm8, %ymm9, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vporq %xmm12, %xmm15, %xmm20 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm10[10],zero,ymm10[12],zero,zero,ymm10[11],zero,ymm10[13],zero,ymm10[15],zero,zero,ymm10[14],zero,zero,zero,ymm10[26],zero,ymm10[28],zero,zero,ymm10[27],zero,ymm10[29],zero,ymm10[31],zero,zero,ymm10[30],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm10[5],zero,zero,ymm10[4],zero,ymm10[6],zero,ymm10[8],zero,zero,ymm10[7],zero,ymm10[9],zero,zero,zero,ymm10[21],zero,zero,ymm10[20],zero,ymm10[22],zero,ymm10[24],zero,zero,ymm10[23],zero,ymm10[25],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm21 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[11,u],zero,ymm6[10],zero,ymm6[12,u],zero,ymm6[u],zero,ymm6[13],zero,ymm6[15,u],zero,ymm6[14,27,u],zero,ymm6[26],zero,ymm6[28,u],zero,ymm6[u],zero,ymm6[29],zero,ymm6[31,u],zero,ymm6[30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[3],zero,ymm6[5,u],zero,ymm6[4],zero,ymm6[6],zero,ymm6[8,u],zero,ymm6[7],zero,ymm6[9,u,19],zero,ymm6[21,u],zero,ymm6[20],zero,ymm6[22],zero,ymm6[24,u],zero,ymm6[23],zero,ymm6[25,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm22 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[11],zero,zero,ymm0[10],zero,ymm0[12],zero,ymm0[14],zero,zero,ymm0[13],zero,ymm0[15],zero,zero,zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm11[3],zero,ymm11[5],zero,zero,ymm11[4],zero,ymm11[6],zero,ymm11[8],zero,zero,ymm11[7],zero,zero,zero,ymm11[19],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm23 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[9],zero,ymm11[11,u],zero,ymm11[10],zero,ymm11[12],zero,ymm11[14,u],zero,ymm11[13],zero,ymm11[15,u,25],zero,ymm11[27,u],zero,ymm11[26],zero,ymm11[28],zero,ymm11[30,u],zero,ymm11[29],zero,ymm11[31,u] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,128,5,128,5,4,128,6,128,8,128,6,7,128,9,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: 
vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm15 +; AVX512F-SLOW-NEXT: vporq %xmm3, %xmm15, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm13 +; AVX512F-SLOW-NEXT: vporq %xmm14, %xmm13, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm27[1,1,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm13, %ymm14, %ymm13 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm15, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm14, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7> -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17> -; AVX512F-SLOW-NEXT: vpermi2d %zmm15, %zmm29, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm11[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7> +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm12, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,3,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm28 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm30, %zmm0, %zmm26 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm4[11],zero,zero,ymm4[10],zero,ymm4[12],zero,ymm4[14],zero,zero,ymm4[13],zero,ymm4[15],zero,zero,zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm11[3],zero,ymm11[5],zero,zero,ymm11[4],zero,ymm11[6],zero,ymm11[8],zero,zero,ymm11[7],zero,zero,zero,ymm11[19],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[9],zero,ymm11[11,u],zero,ymm11[10],zero,ymm11[12],zero,ymm11[14,u],zero,ymm11[13],zero,ymm11[15,u,25],zero,ymm11[27,u],zero,ymm11[26],zero,ymm11[28],zero,ymm11[30,u],zero,ymm11[29],zero,ymm11[31,u] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-SLOW-NEXT: 
vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm5[5],zero,zero,ymm5[4],zero,ymm5[6],zero,ymm5[8],zero,zero,ymm5[7],zero,ymm5[9],zero,zero,zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[3],zero,ymm6[5,u],zero,ymm6[4],zero,ymm6[6],zero,ymm6[8,u],zero,ymm6[7],zero,ymm6[9,u,19],zero,ymm6[21,u],zero,ymm6[20],zero,ymm6[22],zero,ymm6[24,u],zero,ymm6[23],zero,ymm6[25,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[10],zero,ymm5[12],zero,zero,ymm5[11],zero,ymm5[13],zero,ymm5[15],zero,zero,ymm5[14],zero,zero,zero,ymm5[26],zero,ymm5[28],zero,zero,ymm5[27],zero,ymm5[29],zero,ymm5[31],zero,zero,ymm5[30],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm31[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm17, %zmm16 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm14, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm20, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm17, 
%zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm14, %ymm12, %ymm6 -; AVX512F-SLOW-NEXT: vpandq %ymm14, %ymm30, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm24[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vporq %zmm11, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm11, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm11, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm8[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm9[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-SLOW-NEXT: vpermd %zmm29, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r9) +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[11,u],zero,ymm6[10],zero,ymm6[12,u],zero,ymm6[u],zero,ymm6[13],zero,ymm6[15,u],zero,ymm6[14,27,u],zero,ymm6[26],zero,ymm6[28,u],zero,ymm6[u],zero,ymm6[29],zero,ymm6[31,u],zero,ymm6[30] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17> +; AVX512F-SLOW-NEXT: vpermi2d %zmm27, %zmm15, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm18[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm20[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm25, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm14, %zmm4 +; AVX512F-SLOW-NEXT: 
vpermq {{.*#+}} zmm0 = zmm21[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm22[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm23[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm8, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm9, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm31 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm29 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm28[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm26[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 192(%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm18 -; 
AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm4 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm4 -; AVX512F-FAST-NEXT: vporq %xmm3, %xmm4, %xmm19 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm23 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[27],zero,zero,ymm8[26],zero,ymm8[28],zero,ymm8[30],zero,zero,ymm8[29],zero,ymm8[31],zero,zero -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; 
AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX512F-FAST-NEXT: vporq %xmm0, %xmm2, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vporq %ymm4, %ymm7, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm14 +; AVX512F-FAST-NEXT: vporq %xmm12, %xmm14, %xmm20 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm8[10],zero,ymm8[12],zero,zero,ymm8[11],zero,ymm8[13],zero,ymm8[15],zero,zero,ymm8[14],zero,zero,zero,ymm8[26],zero,ymm8[28],zero,zero,ymm8[27],zero,ymm8[29],zero,ymm8[31],zero,zero,ymm8[30],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[5],zero,zero,ymm8[4],zero,ymm8[6],zero,ymm8[8],zero,zero,ymm8[7],zero,ymm8[9],zero,zero,zero,ymm8[21],zero,zero,ymm8[20],zero,ymm8[22],zero,ymm8[24],zero,zero,ymm8[23],zero,ymm8[25],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm21 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[11,u],zero,ymm1[10],zero,ymm1[12,u],zero,ymm1[u],zero,ymm1[13],zero,ymm1[15,u],zero,ymm1[14,27,u],zero,ymm1[26],zero,ymm1[28,u],zero,ymm1[u],zero,ymm1[29],zero,ymm1[31,u],zero,ymm1[30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[3],zero,ymm1[5,u],zero,ymm1[4],zero,ymm1[6],zero,ymm1[8,u],zero,ymm1[7],zero,ymm1[9,u,19],zero,ymm1[21,u],zero,ymm1[20],zero,ymm1[22],zero,ymm1[24,u],zero,ymm1[23],zero,ymm1[25,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm22 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm0[11],zero,zero,ymm0[10],zero,ymm0[12],zero,ymm0[14],zero,zero,ymm0[13],zero,ymm0[15],zero,zero,zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm11[3],zero,ymm11[5],zero,zero,ymm11[4],zero,ymm11[6],zero,ymm11[8],zero,zero,ymm11[7],zero,zero,zero,ymm11[19],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <9,128,11,u,128,10,128,12,128,14,u,128,13,128,15,u,25,128,27,u,128,26,128,28,128,30,u,128,29,128,31,u> +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,128,5,128,5,4,128,6,128,8,128,6,7,128,9,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; 
AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm10, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vporq %ymm13, %ymm6, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: vporq %xmm0, %xmm14, %xmm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [1,1,2,2,2,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm25, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> -; AVX512F-FAST-NEXT: vpermi2d %zmm25, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm26, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm26 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm29, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm0 -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm13 -; AVX512F-FAST-NEXT: vpor %ymm15, %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm20, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm11, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm21, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpternlogq 
$248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm1, %ymm6 -; AVX512F-FAST-NEXT: vpandq %ymm12, %ymm30, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm24[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vporq %zmm7, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm15 +; AVX512F-FAST-NEXT: vporq %xmm13, %xmm15, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX512F-FAST-NEXT: vporq %xmm15, %xmm4, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa64 32(%r8), %xmm27 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,2,2,2,2,2,2] +; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm4, %ymm15, %ymm4 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm14 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <4,u,5,5,5,5,u,6,14,14,14,u,15,15,15,15> +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm14, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,6,5,5,5,5,4,6] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm14, %ymm28, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm30 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm28 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm31, %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm5[11],zero,zero,ymm5[10],zero,ymm5[12],zero,ymm5[14],zero,zero,ymm5[13],zero,ymm5[15],zero,zero,zero,ymm5[27],zero,zero,ymm5[26],zero,ymm5[28],zero,ymm5[30],zero,zero,ymm5[29],zero,ymm5[31],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm11[3],zero,ymm11[5],zero,zero,ymm11[4],zero,ymm11[6],zero,ymm11[8],zero,zero,ymm11[7],zero,zero,zero,ymm11[19],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm3[5],zero,zero,ymm3[4],zero,ymm3[6],zero,ymm3[8],zero,zero,ymm3[7],zero,ymm3[9],zero,zero,zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[3],zero,ymm10[5,u],zero,ymm10[4],zero,ymm10[6],zero,ymm10[8,u],zero,ymm10[7],zero,ymm10[9,u,19],zero,ymm10[21,u],zero,ymm10[20],zero,ymm10[22],zero,ymm10[24,u],zero,ymm10[23],zero,ymm10[25,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10],zero,ymm3[12],zero,zero,ymm3[11],zero,ymm3[13],zero,ymm3[15],zero,zero,ymm3[14],zero,zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[11,u],zero,ymm10[10],zero,ymm10[12,u],zero,ymm10[u],zero,ymm10[13],zero,ymm10[15,u],zero,ymm10[14,27,u],zero,ymm10[26],zero,ymm10[28,u],zero,ymm10[u],zero,ymm10[29],zero,ymm10[31,u],zero,ymm10[30] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm27, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: 
vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm20[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm14, %zmm13 +; AVX512F-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm18, %zmm0 +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm14, %zmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm21[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm22[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm23[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm9, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpor %ymm3, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm28[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm26[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride5_vf64: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = 
[9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm23, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = 
[128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX512BW-ONLY-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm26, %ymm17 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <3,3,3,3,u,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm16, %ymm13, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm15 {%k5} -; 
AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermd %zmm16, %zmm15, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm15, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm16, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm16, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm15, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm9 {%k1} -; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm4, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm5, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm0, %ymm13, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq +; AVX512BW-SLOW-LABEL: store_i8_stride5_vf64: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; 
AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm3 {%k3} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm16[0],xmm12[0],xmm16[1],xmm12[1],xmm16[2],xmm12[2],xmm16[3],xmm12[3],xmm16[4],xmm12[4],xmm16[5],xmm12[5],xmm16[6],xmm12[6],xmm16[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm9, %xmm9 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512BW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] +; AVX512BW-SLOW-NEXT: vpermi2d %zmm24, %zmm6, %zmm9 +; AVX512BW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm3 {%k4} +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm18 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm10, %ymm18, %ymm25 +; AVX512BW-SLOW-NEXT: vporq %ymm23, %ymm25, %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm25 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-SLOW-NEXT: vpshufb %xmm25, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm7, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm16, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm29 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,3,3,u,4,4,4,4> +; AVX512BW-SLOW-NEXT: vpermd %ymm29, %ymm11, %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm30 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-SLOW-NEXT: movl $138547332, %eax # imm = 
0x8421084 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm30, %ymm23 {%k4} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm16 +; AVX512BW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k5 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm7 {%k5} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,3,3,3,u,4,4,4> +; AVX512BW-SLOW-NEXT: vpermd %ymm23, %ymm16, %ymm31 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm24[1,1,2,2] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm24, %zmm24 +; AVX512BW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm7 {%k6} +; AVX512BW-SLOW-NEXT: vpshufb %xmm25, %xmm15, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm13, %xmm25 +; AVX512BW-SLOW-NEXT: vporq %xmm24, %xmm25, %xmm24 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm24, %zmm13, %zmm13 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm13[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm20, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm19, %xmm17 +; AVX512BW-SLOW-NEXT: vporq %xmm13, %xmm17, %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm17, %xmm17 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm17, %zmm13 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C +; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm13 {%k6} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512BW-SLOW-NEXT: vpermd %zmm21, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm13 {%k6} +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm15, %ymm30, %ymm17 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm19, %ymm29, %ymm20 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] +; AVX512BW-SLOW-NEXT: vporq %ymm17, %ymm20, %ymm17 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm30, %ymm8 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm20 = ymm29[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm8 {%k3} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = 
[2,3,128,5,128,5,4,128,6,128,8,128,6,7,128,9,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm14, %ymm20 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm21, %ymm18, %ymm22 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] +; AVX512BW-SLOW-NEXT: vporq %ymm20, %ymm22, %ymm20 +; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm18, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm5 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm20, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm5 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] +; AVX512BW-SLOW-NEXT: vpermd %zmm23, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm1, %ymm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm21, %ymm0, %ymm14 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpor %ymm8, %ymm14, %ymm8 +; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb %ymm15, %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm19, %ymm4, %ymm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512BW-SLOW-NEXT: vpermd %ymm4, %ymm11, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm4 {%k4} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} +; AVX512BW-SLOW-NEXT: vpermd %ymm6, %ymm16, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,2,1,1,4,6,5,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 256(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride5_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm19 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm19, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm18, %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm16 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm11, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm13, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm20 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm12, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm21 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm22, %xmm14, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,0,1,1] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm21 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> -; AVX512BW-FAST-NEXT: vpermd %ymm21, %ymm3, %ymm22 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm23 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,0,1,1] +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> 
+; AVX512BW-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm15 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm23, %ymm22 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm15 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 ; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm20, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm20 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <1,1,2,2,2,2,2,2,27,27,27,27,u,28,28,28> -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm20, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <1,1,2,2,2,2,2,2,11,11,11,11,u,12,12,12> +; AVX512BW-FAST-NEXT: vpermd %zmm24, %zmm25, %zmm24 ; AVX512BW-FAST-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm0 {%k3} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,zmm22[21],zero,zero,zmm22[20],zero,zmm22[22],zero,zmm22[24],zero,zero,zmm22[23],zero,zmm22[25],zero,zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zmm22[58],zero,zmm22[60],zero,zero,zero,zero,zmm22[61],zero,zmm22[63],zero,zero,zmm22[62] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[21],zero,zero,zmm21[20],zero,zmm21[22],zero,zmm21[24],zero,zero,zmm21[23],zero,zmm21[25],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm21[58],zero,zmm21[60],zero,zero,zmm21[59],zero,zmm21[61],zero,zmm21[63],zero,zero,zmm21[62],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm19 = zmm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,zmm19[21],zero,zmm19[21,20],zero,zmm19[22],zero,zmm19[24],zero,zmm19[22,23],zero,zmm19[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm19[59],zero,zero,zmm19[58],zero,zmm19[60],zero,zmm19[62],zero,zero,zmm19[61],zero,zmm19[63],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm19[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm13 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[19],zero,zmm13[21],zero,zero,zmm13[20],zero,zmm13[22],zero,zmm13[24],zero,zero,zmm13[23],zero,zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[59],zero,zero,zmm13[58],zero,zmm13[60],zero,zmm13[62],zero,zero,zmm13[61],zero,zmm13[63],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm13 = 
zmm13[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vporq %zmm19, %zmm13, %zmm13 -; AVX512BW-FAST-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm13 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm20, %zmm19 -; AVX512BW-FAST-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512BW-FAST-NEXT: kmovq %rax, %k4 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm19, %zmm13 {%k4} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm19[27],zero,zero,ymm19[26],zero,ymm19[28],zero,ymm19[30],zero,zero,ymm19[29],zero,ymm19[31],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm21 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[27],zero,zero,ymm21[26],zero,ymm21[28],zero,ymm21[30],zero,zero,ymm21[29],zero,ymm21[31],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm22, %ymm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm16[0],xmm18[0],xmm16[1],xmm18[1],xmm16[2],xmm18[2],xmm16[3],xmm18[3],xmm16[4],xmm18[4],xmm16[5],xmm18[5],xmm16[6],xmm18[6],xmm16[7],xmm18[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm16, %xmm16 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,1,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm16[26],zero,ymm16[28],zero,zero,zero,zero,ymm16[29],zero,ymm16[31],zero,zero,ymm16[30] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm24 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[26],zero,ymm23[28],zero,zero,ymm23[27],zero,ymm23[29],zero,ymm23[31],zero,zero,ymm23[30],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] -; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm24, %ymm22 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3],xmm15[4],xmm17[4],xmm15[5],xmm17[5],xmm15[6],xmm17[6],xmm15[7],xmm17[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm17 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm15, %xmm15 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm22, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm20, %zmm15 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm20, %zmm5 -; AVX512BW-FAST-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm15 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm18, 
%xmm6, %xmm6 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm7 -; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm7, %xmm7 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm24, %zmm0 {%k3} +; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm10, %xmm16 +; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm17, %xmm18 +; AVX512BW-FAST-NEXT: vporq %xmm16, %xmm18, %xmm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm17[0],xmm10[1],xmm17[1],xmm10[2],xmm17[2],xmm10[3],xmm17[3],xmm10[4],xmm17[4],xmm10[5],xmm17[5],xmm10[6],xmm17[6],xmm10[7],xmm17[7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm10, %xmm10 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm16, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm10[0,0,1,1,4,4,5,5] +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm19, %xmm10 +; AVX512BW-FAST-NEXT: vpshufb %xmm22, %xmm21, %xmm18 +; AVX512BW-FAST-NEXT: vporq %xmm10, %xmm18, %xmm10 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3],xmm19[4],xmm21[4],xmm19[5],xmm21[5],xmm19[6],xmm21[6],xmm19[7],xmm21[7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm18, %xmm18 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm18, %zmm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] ; AVX512BW-FAST-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm7, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k3} +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm23, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FAST-NEXT: vpermd %zmm16, %zmm18, %zmm16 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm21, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm19, %ymm2 -; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm21[21],zero,ymm21[21,20],zero,ymm21[22],zero,ymm21[24],zero,ymm21[22,23],zero,ymm21[25] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[19],zero,ymm19[21],zero,zero,ymm19[20],zero,ymm19[22],zero,ymm19[24],zero,zero,ymm19[23],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm16[21],zero,zero,ymm16[20],zero,ymm16[22],zero,ymm16[24],zero,zero,ymm16[23],zero,ymm16[25],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[21],zero,zero,ymm23[20],zero,ymm23[22],zero,ymm23[24],zero,zero,ymm23[23],zero,ymm23[25],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vpermd %ymm23, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm16, %ymm3 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm16[27],zero,zero,ymm16[26],zero,ymm16[28],zero,ymm16[30],zero,zero,ymm16[29],zero,ymm16[31],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,2,3,3] +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm20[27],zero,zero,ymm20[26],zero,ymm20[28],zero,ymm20[30],zero,zero,ymm20[29],zero,ymm20[31],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,3,3] +; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm21, %ymm18 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm11, %xmm11 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm18, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm17 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,zero,zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[26],zero,ymm18[28],zero,zero,ymm18[27],zero,ymm18[29],zero,ymm18[31],zero,zero,ymm18[30],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,3,3] +; AVX512BW-FAST-NEXT: vporq %ymm11, %ymm21, %ymm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm12, %xmm12 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512BW-FAST-NEXT: kmovq %rax, %k3 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm13, %zmm11 {%k3} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512BW-FAST-NEXT: vpermd %zmm13, %zmm14, %zmm13 +; AVX512BW-FAST-NEXT: movabsq $2380225041768974402, %rax 
# imm = 0x2108421084210842 +; AVX512BW-FAST-NEXT: kmovq %rax, %k4 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm13, %zmm11 {%k4} +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm20, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm6 +; AVX512BW-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm20[2,3],zero,ymm20[5],zero,ymm20[5,4],zero,ymm20[6],zero,ymm20[8],zero,ymm20[6,7],zero,ymm20[9,18,19],zero,ymm20[21],zero,ymm20[21,20],zero,ymm20[22],zero,ymm20[24],zero,ymm20[22,23],zero,ymm20[25] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm16[3],zero,ymm16[5],zero,zero,ymm16[4],zero,ymm16[6],zero,ymm16[8],zero,zero,ymm16[7],zero,zero,zero,ymm16[19],zero,ymm16[21],zero,zero,ymm16[20],zero,ymm16[22],zero,ymm16[24],zero,zero,ymm16[23],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm13, %ymm6 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm17[3],zero,ymm17[5],zero,zero,ymm17[4],zero,ymm17[6],zero,ymm17[8],zero,zero,ymm17[7],zero,ymm17[9],zero,ymm17[19],zero,ymm17[21],zero,zero,ymm17[20],zero,ymm17[22],zero,ymm17[24],zero,zero,ymm17[23],zero,ymm17[25],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,ymm18[5],zero,zero,ymm18[4],zero,ymm18[6],zero,ymm18[8],zero,zero,ymm18[7],zero,ymm18[9],zero,zero,zero,ymm18[21],zero,zero,ymm18[20],zero,ymm18[22],zero,ymm18[24],zero,zero,ymm18[23],zero,ymm18[25],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm13, %ymm6 +; AVX512BW-FAST-NEXT: vpermd %ymm18, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm17, %ymm8 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> +; AVX512BW-FAST-NEXT: vpermd %zmm12, %zmm5, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%r9) +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,zmm5[21],zero,zmm5[21,20],zero,zmm5[22],zero,zmm5[24],zero,zmm5[22,23],zero,zmm5[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm5[59],zero,zero,zmm5[58],zero,zmm5[60],zero,zmm5[62],zero,zero,zmm5[61],zero,zmm5[63],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm1[19],zero,zmm1[21],zero,zero,zmm1[20],zero,zmm1[22],zero,zmm1[24],zero,zero,zmm1[23],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zmm1[58],zero,zmm1[60],zero,zmm1[62],zero,zero,zmm1[61],zero,zmm1[63],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = 
zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,zmm4[21],zero,zero,zmm4[20],zero,zmm4[22],zero,zmm4[24],zero,zero,zmm4[23],zero,zmm4[25],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zmm4[58],zero,zmm4[60],zero,zero,zero,zero,zmm4[61],zero,zmm4[63],zero,zero,zmm4[62] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[21],zero,zero,zmm3[20],zero,zmm3[22],zero,zmm3[24],zero,zero,zmm3[23],zero,zmm3[25],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm3[58],zero,zmm3[60],zero,zero,zmm3[59],zero,zmm3[61],zero,zmm3[63],zero,zero,zmm3[62],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vporq %zmm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,5,5,5,5,4,6,14,14,14,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermd %zmm7, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 192(%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i8_stride5_vf64: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm4, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm2 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [6,6,6,6,7,7,7,7,16,16,16,16,16,16,17,17] -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm23, %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 -; AVX512DQBW-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 -; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX512DQBW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm26, %ymm17 {%k3} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm13 -; AVX512DQBW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <3,3,3,3,u,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm16, %ymm13, %ymm17 -; AVX512DQBW-SLOW-NEXT: vpshufd 
{{.*#+}} xmm18 = mem[1,1,2,2] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm15, %ymm26, %ymm15 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm15 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] -; AVX512DQBW-SLOW-NEXT: vpermd %zmm16, %zmm15, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm15, %xmm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm16, %xmm22 -; AVX512DQBW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm7, %zmm7 -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm16, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm15, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512DQBW-SLOW-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm4, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm5, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpermd %ymm0, %ymm13, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 @@ -4776,9 +4594,11 @@ ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} +; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} +; AVX512DQBW-SLOW: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -22,31 +22,29 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, (%rax) -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: movd %xmm0, 8(%rax) ; SSE-NEXT: retq ; @@ -55,14 +53,13 @@ ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r9), %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,2,6,10,14,3,7,u,u,u,u] ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq @@ -90,44 +87,51 @@ ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] +; SSE-NEXT: packuswb %xmm3, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand 
%xmm3, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movq %xmm2, 16(%rax) ; SSE-NEXT: movdqa %xmm3, (%rax) ; SSE-NEXT: retq @@ -135,13 +139,13 @@ ; AVX1-ONLY-LABEL: store_i8_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0,2] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,xmm0[1,5,9,13],zero,zero,xmm0[2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4],zero,zero,zero,zero,xmm1[1,5],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 @@ -155,13 +159,15 @@ ; AVX2-ONLY-LABEL: store_i8_stride6_vf4: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq 
{{.*#+}} xmm2 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -176,13 +182,15 @@ ; AVX512F-LABEL: store_i8_stride6_vf4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -197,13 +205,15 @@ ; AVX512BW-LABEL: store_i8_stride6_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12,u,u,1,5,9,13,u,u,2,6,10,14,18,22,u,u,u,u,19,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -300,30 +310,32 @@ ; AVX1-ONLY-LABEL: store_i8_stride6_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; 
AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u],zero,zero,xmm1[3,11,u,u],zero,zero,xmm1[4,12,u,u],zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u],zero,zero,xmm2[3,11,u,u],zero,zero,xmm2[4,12,u,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,11],zero,zero,xmm0[u,u,4,12],zero,zero,xmm0[u,u,5,13] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,8,u,u],zero,zero,xmm1[1,9,u,u],zero,zero,xmm1[2,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[0,8,u,u],zero,zero,xmm2[1,9,u,u],zero,zero,xmm2[2,10] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,u,1,9],zero,zero,xmm0[u,u,2,10],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,13,u,u],zero,zero,xmm2[6,14,u,u],zero,zero,xmm2[7,15,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) @@ -335,29 +347,31 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,u,6,14],zero,zero,xmm1[u,u,7,15],zero,zero,xmm1[u,u] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,13,u,u],zero,zero,xmm4[6,14,u,u],zero,zero,xmm4[7,15,u,u] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -366,29 +380,31 @@ ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = 
mem[0],zero ; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512F-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[u,u,6,14],zero,zero,xmm2[u,u,7,15],zero,zero,xmm2[u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,13,u,u],zero,zero,xmm3[6,14,u,u],zero,zero,xmm3[7,15,u,u] +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512F-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -397,31 +413,33 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[u,u,6,14],zero,zero,xmm2[u,u,7,15],zero,zero,xmm2[u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,13,u,u],zero,zero,xmm3[6,14,u,u],zero,zero,xmm3[7,15,u,u] +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -444,136 +462,132 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm13 -; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm10 -; SSE-NEXT: movdqa 
%xmm12, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm8, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm9, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm14, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: 
pshufd {{.*#+}} xmm15 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: por %xmm14, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: pand %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm6[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm14, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,0,2,2,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm9, 16(%rax) -; SSE-NEXT: movdqa %xmm14, 32(%rax) -; SSE-NEXT: movdqa %xmm3, 48(%rax) -; SSE-NEXT: movdqa %xmm15, 80(%rax) -; SSE-NEXT: movdqa %xmm10, 64(%rax) +; SSE-NEXT: movdqa %xmm10, 80(%rax) +; SSE-NEXT: movdqa %xmm4, 48(%rax) +; SSE-NEXT: movdqa %xmm11, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm8, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq @@ -686,9 +700,9 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] @@ -725,9 +739,9 @@ ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -769,19 +783,18 @@ ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,8,10,9,11] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u] -; 
AVX512BW-FAST-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58,u,u] +; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u,50,58] ; AVX512BW-FAST-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm5, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3] ; AVX512BW-FAST-NEXT: vpermq %zmm2, %zmm4, %zmm4 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u] @@ -824,629 +837,609 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $200, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa 16(%rcx), %xmm4 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; 
SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa (%rdx), %xmm13 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa (%r8), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,6,7,7] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 
= xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa (%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm0, 
%xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa 16(%rdx), %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa 16(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm15 
-; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa 16(%r9), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm13 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; 
SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: pand %xmm12, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm11[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm8, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: 
pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshuflw $161, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm13 -; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm13, %xmm7 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 32(%rax) -; SSE-NEXT: movdqa %xmm7, 48(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) +; SSE-NEXT: movdqa %xmm7, 176(%rax) +; SSE-NEXT: movdqa %xmm9, 160(%rax) +; SSE-NEXT: movdqa %xmm14, 144(%rax) +; SSE-NEXT: movdqa %xmm8, 128(%rax) ; SSE-NEXT: movdqa %xmm5, 112(%rax) -; SSE-NEXT: movdqa %xmm15, 160(%rax) -; SSE-NEXT: movdqa %xmm6, 176(%rax) -; SSE-NEXT: movdqa %xmm11, (%rax) -; SSE-NEXT: movdqa %xmm12, 16(%rax) +; SSE-NEXT: movdqa %xmm13, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: addq $200, %rsp +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: 
store_i8_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm13[8],zero,zero,zero,zero,zero,xmm13[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[5],zero,zero,zero,zero,zero,xmm13[6],zero,zero,zero,zero,zero,xmm13[7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 +; 
AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,u],zero,zero,zero,zero,xmm5[3,u],zero,zero,zero,zero,xmm5[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[2],zero,zero,zero,zero,zero,xmm3[3],zero,zero,zero,zero,zero,xmm3[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm5[0,u],zero,zero,zero,zero,xmm5[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm3[0],zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[13,u],zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,xmm5[15,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[13],zero,zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,zero,xmm3[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm13[13],zero,zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,zero,xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[10,u],zero,zero,zero,zero,xmm5[11,u],zero,zero,zero,zero,xmm5[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm3[10],zero,zero,zero,zero,zero,xmm3[11],zero,zero,zero,zero,zero,xmm3[12],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5],xmm2[6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm2, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm9, %ymm15 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = 
[128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, 
%ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[8,u],zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm3[8],zero,zero,zero,zero,zero,xmm3[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0,u],zero,zero,zero,zero,xmm2[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[2],zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,xmm1[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: 
vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 160(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[16],ymm10[16],ymm7[17],ymm10[17],ymm7[18],ymm10[18],ymm7[19],ymm10[19],ymm7[20],ymm10[20],ymm7[21],ymm10[21],ymm7[22],ymm10[22],ymm7[23],ymm10[23] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[16],ymm12[16],ymm9[17],ymm12[17],ymm9[18],ymm12[18],ymm9[19],ymm12[19],ymm9[20],ymm12[20],ymm9[21],ymm12[21],ymm9[22],ymm12[22],ymm9[23],ymm12[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} 
ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[4],ymm15[4],ymm9[5],ymm15[5],ymm9[6],ymm15[6],ymm9[7],ymm15[7],ymm9[16],ymm15[16],ymm9[17],ymm15[17],ymm9[18],ymm15[18],ymm9[19],ymm15[19],ymm9[20],ymm15[20],ymm9[21],ymm15[21],ymm9[22],ymm15[22],ymm9[23],ymm15[23] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm10, %ymm15, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm15, %ymm12 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm12, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm13 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[16],ymm6[16],ymm2[17],ymm6[17],ymm2[18],ymm6[18],ymm2[19],ymm6[19],ymm2[20],ymm6[20],ymm2[21],ymm6[21],ymm2[22],ymm6[22],ymm2[23],ymm6[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) -; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1510,79 +1503,79 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, 
%ymm13, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; 
AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] 
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1647,79 +1640,79 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = 
ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3],xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1731,76 +1724,81 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512F-SLOW-NEXT: vprold $16, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512F-SLOW-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] 
+; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm14, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm12 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm18 +; 
AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm18, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpandq %zmm7, %zmm12, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[16],ymm14[16],ymm1[17],ymm14[17],ymm1[18],ymm14[18],ymm1[19],ymm14[19],ymm1[20],ymm14[20],ymm1[21],ymm14[21],ymm1[22],ymm14[22],ymm1[23],ymm14[23] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512F-SLOW-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm13[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[16],ymm9[16],ymm7[17],ymm9[17],ymm7[18],ymm9[18],ymm7[19],ymm9[19],ymm7[20],ymm9[20],ymm7[21],ymm9[21],ymm7[22],ymm9[22],ymm7[23],ymm9[23] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm18, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = 
ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 ; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 @@ -1809,21 +1807,19 @@ ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $202, %zmm7, %zmm2, %zmm3 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm8, %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15],ymm1[24],ymm14[24],ymm1[25],ymm14[25],ymm1[26],ymm14[26],ymm1[27],ymm14[27],ymm1[28],ymm14[28],ymm1[29],ymm14[29],ymm1[30],ymm14[30],ymm1[31],ymm14[31] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -1836,187 +1832,191 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: 
vpternlogq $202, %zmm6, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm12[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = 
[6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm10[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm14 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm15, %zmm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm15, %zmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm14, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm15 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm15, %zmm6 +; AVX512F-FAST-NEXT: 
vpunpcklbw {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] +; AVX512F-FAST-NEXT: vprold $16, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,10,10,10,11] +; AVX512F-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpermt2q %zmm13, %zmm11, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm15, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpandq %zmm8, %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm11, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = 
ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512F-FAST-NEXT: vprold $16, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] -; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm15, %zmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm8, %zmm14, %zmm2 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i8_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-SLOW-NEXT: movw $18724, %cx # imm = 0x4924 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm10 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm6 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-SLOW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-SLOW-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm10 {%k2} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm10[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm10 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] -; AVX512BW-SLOW-NEXT: vpermw %ymm13, %ymm15, %ymm13 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512BW-SLOW-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512BW-SLOW-NEXT: vpermw %ymm4, %ymm10, %ymm4 +; 
AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm4 ; AVX512BW-SLOW-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k3} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] -; AVX512BW-SLOW-NEXT: vpermw %ymm11, %ymm13, %ymm11 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-SLOW-NEXT: vprold $16, %ymm13, %ymm13 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm11 {%k2} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] -; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm9, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm9, %ymm7 {%k1} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [10,9,8,11,10,9,8,11,10,9,8,11,12,12,12,12] +; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm12, %ymm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 ; AVX512BW-SLOW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm9, %zmm7 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k2} +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3],xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-SLOW-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512BW-SLOW-NEXT: movl $1227105426, %ecx # imm = 
0x49242492 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm7 {%k2} +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] ; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm10, %ymm9 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm9 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm10, %ymm9 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-SLOW-NEXT: vpermw %ymm2, %ymm3, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512BW-SLOW-NEXT: movl $1227114788, %ecx # imm = 0x49244924 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm5 ; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] @@ -2026,116 +2026,112 @@ ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride6_vf32: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-FAST-NEXT: 
vmovdqa (%rsi), %ymm5 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-FAST-NEXT: vpermw %ymm11, %ymm12, %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm12, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX512BW-FAST-NEXT: movw $18724, %cx # imm = 0x4924 +; AVX512BW-FAST-NEXT: vpermw %ymm11, %ymm12, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = 
xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm13, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 +; AVX512BW-FAST-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm12 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm11[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm11, %zmm8 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm15, %ymm13 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512BW-FAST-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512BW-FAST-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k3 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm15, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [9,8,11,10,9,8,11,10,9,8,11,10,13,12,15,14] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm15, %ymm13 {%k2} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, 
%ymm13, %zmm0, %zmm12 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] -; AVX512BW-FAST-NEXT: vpermw %ymm8, %ymm9, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-FAST-NEXT: vpermw %ymm7, %ymm9, %ymm8 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] -; AVX512BW-FAST-NEXT: movl $1227133513, %ecx # imm = 0x49249249 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm14 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm16, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: movl $1227114788, %ecx # imm = 0x49244924 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-FAST-NEXT: vpermw %zmm8, %zmm9, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm14 {%k2} +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[16],ymm15[16],ymm13[17],ymm15[17],ymm13[18],ymm15[18],ymm13[19],ymm15[19],ymm13[20],ymm15[20],ymm13[21],ymm15[21],ymm13[22],ymm15[22],ymm13[23],ymm15[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm16, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 ; AVX512BW-FAST-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm14 {%k2} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm3, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512BW-FAST-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -2730,379 +2726,358 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $200, %rsp -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: subq $136, %rsp +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm1[2],zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,xmm1[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm0[0,u],zero,zero,zero,zero,xmm0[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero ; 
AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm0[8,u],zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm0[5,u],zero,zero,zero,zero,xmm0[6,u],zero,zero,zero,zero,xmm0[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,1,2],zero,xmm8[4,5,6,7,8],zero,xmm8[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm8[13],zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,xmm8[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[13,u],zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,xmm0[15,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm15 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1,2],xmm10[3],xmm1[4,5],xmm10[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm1[2],zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,xmm1[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0,u],zero,zero,zero,zero,xmm0[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = 
xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm0[8,u],zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm0[5,u],zero,zero,zero,zero,xmm0[6,u],zero,zero,zero,zero,xmm0[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm2[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[13,u],zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,xmm0[15,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[2,u],zero,zero,zero,zero,xmm0[3,u],zero,zero,zero,zero,xmm0[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1,2],xmm10[3],xmm1[4,5],xmm10[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0,u],zero,zero,zero,zero,xmm0[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6],zero,xmm5[8,9,10,11,12],zero,xmm5[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm0[8,u],zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm0[5,u],zero,zero,zero,zero,xmm0[6,u],zero,zero,zero,zero,xmm0[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: 
vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm13 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[10],zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,xmm2[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[13,u],zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,xmm0[15,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,u],zero,zero,zero,zero,xmm0[11,u],zero,zero,zero,zero,xmm0[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, 
%ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2],xmm0[3],xmm9[4,5],xmm0[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm13, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpor 
%xmm15, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: 
vpshufb %xmm10, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,xmm1[13,u],zero,zero,zero,zero,xmm1[14,u],zero,zero,zero,zero,xmm1[15,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 368(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 320(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 336(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 288(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 304(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 256(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 272(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3112,78 +3087,67 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) -; AVX1-ONLY-NEXT: addq $200, %rsp +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $664, %rsp # imm = 0x298 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 +; 
AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -3191,9 +3155,9 @@ ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 @@ -3202,27 +3166,27 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; 
AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 @@ -3231,14 +3195,14 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] @@ -3251,14 +3215,13 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] @@ -3266,29 +3229,30 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] @@ -3299,66 +3263,65 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} 
ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[4],mem[4],ymm15[5],mem[5],ymm15[6],mem[6],ymm15[7],mem[7],ymm15[16],mem[16],ymm15[17],mem[17],ymm15[18],mem[18],ymm15[19],mem[19],ymm15[20],mem[20],ymm15[21],mem[21],ymm15[22],mem[22],ymm15[23],mem[23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm7[0],ymm9[0],ymm7[1],ymm9[1],ymm7[2],ymm9[2],ymm7[3],ymm9[3],ymm7[4],ymm9[4],ymm7[5],ymm9[5],ymm7[6],ymm9[6],ymm7[7],ymm9[7],ymm7[16],ymm9[16],ymm7[17],ymm9[17],ymm7[18],ymm9[18],ymm7[19],ymm9[19],ymm7[20],ymm9[20],ymm7[21],ymm9[21],ymm7[22],ymm9[22],ymm7[23],ymm9[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[16],mem[16],ymm3[17],mem[17],ymm3[18],mem[18],ymm3[19],mem[19],ymm3[20],mem[20],ymm3[21],mem[21],ymm3[22],mem[22],ymm3[23],mem[23] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[4],ymm14[4],ymm7[5],ymm14[5],ymm7[6],ymm14[6],ymm7[7],ymm14[7],ymm7[16],ymm14[16],ymm7[17],ymm14[17],ymm7[18],ymm14[18],ymm7[19],ymm14[19],ymm7[20],ymm14[20],ymm7[21],ymm14[21],ymm7[22],ymm14[22],ymm7[23],ymm14[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; 
AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm11, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm11, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm6, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3380,7 +3343,7 @@ ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 @@ -3390,128 +3353,124 @@ ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11],ymm5[12],mem[12],ymm5[13],mem[13],ymm5[14],mem[14],ymm5[15],mem[15],ymm5[24],mem[24],ymm5[25],mem[25],ymm5[26],mem[26],ymm5[27],mem[27],ymm5[28],mem[28],ymm5[29],mem[29],ymm5[30],mem[30],ymm5[31],mem[31] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX2-SLOW-NEXT: # ymm4 = 
mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $664, %rsp # imm = 0x298 +; AVX2-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 
%xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm12 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -3519,9 +3478,9 @@ ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-NEXT: vpshufb 
%ymm5, %ymm6, %ymm6 @@ -3530,19 +3489,17 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] @@ -3554,243 +3511,240 @@ ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm15 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; 
AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[16],ymm13[16],ymm0[17],ymm13[17],ymm0[18],ymm13[18],ymm0[19],ymm13[19],ymm0[20],ymm13[20],ymm0[21],ymm13[21],ymm0[22],ymm13[22],ymm0[23],ymm13[23] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[4],ymm11[4],ymm7[5],ymm11[5],ymm7[6],ymm11[6],ymm7[7],ymm11[7],ymm7[16],ymm11[16],ymm7[17],ymm11[17],ymm7[18],ymm11[18],ymm7[19],ymm11[19],ymm7[20],ymm11[20],ymm7[21],ymm11[21],ymm7[22],ymm11[22],ymm7[23],ymm11[23] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = 
[255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = 
ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15],ymm1[24],mem[24],ymm1[25],mem[25],ymm1[26],mem[26],ymm1[27],mem[27],ymm1[28],mem[28],ymm1[29],mem[29],ymm1[30],mem[30],ymm1[31],mem[31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11],ymm1[12],ymm13[12],ymm1[13],ymm13[13],ymm1[14],ymm13[14],ymm1[15],ymm13[15],ymm1[24],ymm13[24],ymm1[25],ymm13[25],ymm1[26],ymm13[26],ymm1[27],ymm13[27],ymm1[28],ymm13[28],ymm1[29],ymm13[29],ymm1[30],ymm13[30],ymm1[31],ymm13[31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15],ymm13[24],ymm8[24],ymm13[25],ymm8[25],ymm13[26],ymm8[26],ymm13[27],ymm8[27],ymm13[28],ymm8[28],ymm13[29],ymm8[29],ymm13[30],ymm8[30],ymm13[31],ymm8[31] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = 
ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15],ymm6[24],ymm7[24],ymm6[25],ymm7[25],ymm6[26],ymm7[26],ymm6[27],ymm7[27],ymm6[28],ymm7[28],ymm6[29],ymm7[29],ymm6[30],ymm7[30],ymm6[31],ymm7[31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX2-FAST-NEXT: # 
ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -3798,46 +3752,44 @@ ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -3845,9 +3797,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm3 ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[16],ymm3[16],ymm5[17],ymm3[17],ymm5[18],ymm3[18],ymm5[19],ymm3[19],ymm5[20],ymm3[20],ymm5[21],ymm3[21],ymm5[22],ymm3[22],ymm5[23],ymm3[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 @@ -3856,19 +3808,17 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm11, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] @@ -3880,1433 +3830,1624 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, 
%ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: 
vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[16],ymm13[16],ymm0[17],ymm13[17],ymm0[18],ymm13[18],ymm0[19],ymm13[19],ymm0[20],ymm13[20],ymm0[21],ymm13[21],ymm0[22],ymm13[22],ymm0[23],ymm13[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11],ymm15[12],mem[12],ymm15[13],mem[13],ymm15[14],mem[14],ymm15[15],mem[15],ymm15[24],mem[24],ymm15[25],mem[25],ymm15[26],mem[26],ymm15[27],mem[27],ymm15[28],mem[28],ymm15[29],mem[29],ymm15[30],mem[30],ymm15[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[4],ymm11[4],ymm7[5],ymm11[5],ymm7[6],ymm11[6],ymm7[7],ymm11[7],ymm7[16],ymm11[16],ymm7[17],ymm11[17],ymm7[18],ymm11[18],ymm7[19],ymm11[19],ymm7[20],ymm11[20],ymm7[21],ymm11[21],ymm7[22],ymm11[22],ymm7[23],ymm11[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm15, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm15, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 
= [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15],ymm1[24],mem[24],ymm1[25],mem[25],ymm1[26],mem[26],ymm1[27],mem[27],ymm1[28],mem[28],ymm1[29],mem[29],ymm1[30],mem[30],ymm1[31],mem[31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = 
ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11],ymm1[12],ymm13[12],ymm1[13],ymm13[13],ymm1[14],ymm13[14],ymm1[15],ymm13[15],ymm1[24],ymm13[24],ymm1[25],ymm13[25],ymm1[26],ymm13[26],ymm1[27],ymm13[27],ymm1[28],ymm13[28],ymm1[29],ymm13[29],ymm1[30],ymm13[30],ymm1[31],ymm13[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15],ymm13[24],ymm8[24],ymm13[25],ymm8[25],ymm13[26],ymm8[26],ymm13[27],ymm8[27],ymm13[28],ymm8[28],ymm13[29],ymm8[29],ymm13[30],ymm8[30],ymm13[31],ymm8[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = 
ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15],ymm6[24],ymm7[24],ymm6[25],ymm7[25],ymm6[26],ymm7[26],ymm6[27],ymm7[27],ymm6[28],ymm7[28],ymm6[29],ymm7[29],ymm6[30],ymm7[30],ymm6[31],ymm7[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512F-SLOW-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; 
AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm28 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm26 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm11, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm19 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[4],ymm0[4],ymm7[5],ymm0[5],ymm7[6],ymm0[6],ymm7[7],ymm0[7],ymm7[16],ymm0[16],ymm7[17],ymm0[17],ymm7[18],ymm0[18],ymm7[19],ymm0[19],ymm7[20],ymm0[20],ymm7[21],ymm0[21],ymm7[22],ymm0[22],ymm7[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15],ymm2[24],ymm15[24],ymm2[25],ymm15[25],ymm2[26],ymm15[26],ymm2[27],ymm15[27],ymm2[28],ymm15[28],ymm2[29],ymm15[29],ymm2[30],ymm15[30],ymm2[31],ymm15[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = 
ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm13[8],ymm0[8],ymm13[9],ymm0[9],ymm13[10],ymm0[10],ymm13[11],ymm0[11],ymm13[12],ymm0[12],ymm13[13],ymm0[13],ymm13[14],ymm0[14],ymm13[15],ymm0[15],ymm13[24],ymm0[24],ymm13[25],ymm0[25],ymm13[26],ymm0[26],ymm13[27],ymm0[27],ymm13[28],ymm0[28],ymm13[29],ymm0[29],ymm13[30],ymm0[30],ymm13[31],ymm0[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[16],ymm15[16],ymm2[17],ymm15[17],ymm2[18],ymm15[18],ymm2[19],ymm15[19],ymm2[20],ymm15[20],ymm2[21],ymm15[21],ymm2[22],ymm15[22],ymm2[23],ymm15[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 
-; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15],ymm8[24],ymm10[24],ymm8[25],ymm10[25],ymm8[26],ymm10[26],ymm8[27],ymm10[27],ymm8[28],ymm10[28],ymm8[29],ymm10[29],ymm8[30],ymm10[30],ymm8[31],ymm10[31] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm28 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15],ymm13[24],ymm14[24],ymm13[25],ymm14[25],ymm13[26],ymm14[26],ymm13[27],ymm14[27],ymm13[28],ymm14[28],ymm13[29],ymm14[29],ymm13[30],ymm14[30],ymm13[31],ymm14[31] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm7 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm4 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; 
AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm11 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm12 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15],ymm15[24],ymm12[24],ymm15[25],ymm12[25],ymm15[26],ymm12[26],ymm15[27],ymm12[27],ymm15[28],ymm12[28],ymm15[29],ymm12[29],ymm15[30],ymm12[30],ymm15[31],ymm12[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm10 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[16],ymm6[16],ymm1[17],ymm6[17],ymm1[18],ymm6[18],ymm1[19],ymm6[19],ymm1[20],ymm6[20],ymm1[21],ymm6[21],ymm1[22],ymm6[22],ymm1[23],ymm6[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX512F-SLOW-NEXT: vprold $16, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm30 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[4],ymm14[4],ymm4[5],ymm14[5],ymm4[6],ymm14[6],ymm4[7],ymm14[7],ymm4[16],ymm14[16],ymm4[17],ymm14[17],ymm4[18],ymm14[18],ymm4[19],ymm14[19],ymm4[20],ymm14[20],ymm4[21],ymm14[21],ymm4[22],ymm14[22],ymm4[23],ymm14[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm9 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-SLOW-NEXT: vprold $16, %xmm14, %xmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm31 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[16],ymm12[16],ymm15[17],ymm12[17],ymm15[18],ymm12[18],ymm15[19],ymm12[19],ymm15[20],ymm12[20],ymm15[21],ymm12[21],ymm15[22],ymm12[22],ymm15[23],ymm12[23] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm9 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm12, %zmm1 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm29[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm30, %ymm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, 
%ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm30 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm30 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm19 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm1, %zmm19 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm26, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm18, %ymm19 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm21[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm18[0,0,0,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm13[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm16[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm17[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512F-SLOW-NEXT: vprold $16, %ymm16, %ymm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm9 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; 
AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm8, %ymm6 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm4, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm11 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm29 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm5, %ymm8, %ymm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm8, %ymm31 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm31[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm11 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm24[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm12, %zmm13 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm10 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm11, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = 
zmm7[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm1, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm11, %zmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm31[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm12 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm29, %zmm12 +; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm7 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm1, %zmm7 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm12 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm26, %zmm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm30, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm1, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm10, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm11, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm28[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm27[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm1, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm20[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm29, %zmm3 +; AVX512F-SLOW-NEXT: vpermq 
{{.*#+}} zmm4 = zmm21[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm6, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm22[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm23[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm1, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm24[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm25[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-SLOW-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm5 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; 
AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15],ymm8[24],ymm4[24],ymm8[25],ymm4[25],ymm8[26],ymm4[26],ymm8[27],ymm4[27],ymm8[28],ymm4[28],ymm8[29],ymm4[29],ymm8[30],ymm4[30],ymm8[31],ymm4[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11],ymm8[12],ymm14[12],ymm8[13],ymm14[13],ymm8[14],ymm14[14],ymm8[15],ymm14[15],ymm8[24],ymm14[24],ymm8[25],ymm14[25],ymm8[26],ymm14[26],ymm8[27],ymm14[27],ymm8[28],ymm14[28],ymm8[29],ymm14[29],ymm8[30],ymm14[30],ymm8[31],ymm14[31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11],ymm11[12],ymm6[12],ymm11[13],ymm6[13],ymm11[14],ymm6[14],ymm11[15],ymm6[15],ymm11[24],ymm6[24],ymm11[25],ymm6[25],ymm11[26],ymm6[26],ymm11[27],ymm6[27],ymm11[28],ymm6[28],ymm11[29],ymm6[29],ymm11[30],ymm6[30],ymm11[31],ymm6[31] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm7 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) 
# 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm10, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[2,2,2,3] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,2,2,3] -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm24 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[4],ymm6[4],ymm11[5],ymm6[5],ymm11[6],ymm6[6],ymm11[7],ymm6[7],ymm11[16],ymm6[16],ymm11[17],ymm6[17],ymm11[18],ymm6[18],ymm11[19],ymm6[19],ymm11[20],ymm6[20],ymm11[21],ymm6[21],ymm11[22],ymm6[22],ymm11[23],ymm6[23] -; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[4],ymm14[4],ymm8[5],ymm14[5],ymm8[6],ymm14[6],ymm8[7],ymm14[7],ymm8[16],ymm14[16],ymm8[17],ymm14[17],ymm8[18],ymm14[18],ymm8[19],ymm14[19],ymm8[20],ymm14[20],ymm8[21],ymm14[21],ymm8[22],ymm14[22],ymm8[23],ymm14[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,0,1,10,10,10,11] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm30, %zmm6 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm2 -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm16 -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[4],ymm8[4],ymm1[5],ymm8[5],ymm1[6],ymm8[6],ymm1[7],ymm8[7],ymm1[16],ymm8[16],ymm1[17],ymm8[17],ymm1[18],ymm8[18],ymm1[19],ymm8[19],ymm1[20],ymm8[20],ymm1[21],ymm8[21],ymm1[22],ymm8[22],ymm1[23],ymm8[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15],ymm13[24],ymm14[24],ymm13[25],ymm14[25],ymm13[26],ymm14[26],ymm13[27],ymm14[27],ymm13[28],ymm14[28],ymm13[29],ymm14[29],ymm13[30],ymm14[30],ymm13[31],ymm14[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15],ymm9[24],ymm10[24],ymm9[25],ymm10[25],ymm9[26],ymm10[26],ymm9[27],ymm10[27],ymm9[28],ymm10[28],ymm9[29],ymm10[29],ymm9[30],ymm10[30],ymm9[31],ymm10[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm11 
+; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[16],ymm2[16],ymm8[17],ymm2[17],ymm8[18],ymm2[18],ymm8[19],ymm2[19],ymm8[20],ymm2[20],ymm8[21],ymm2[21],ymm8[22],ymm2[22],ymm8[23],ymm2[23] +; AVX512F-FAST-NEXT: vprold $16, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: # xmm3 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = 
[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,0,1,10,10,10,11] +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm7 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[16],ymm10[16],ymm9[17],ymm10[17],ymm9[18],ymm10[18],ymm9[19],ymm10[19],ymm9[20],ymm10[20],ymm9[21],ymm10[21],ymm9[22],ymm10[22],ymm9[23],ymm10[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512F-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vprold $16, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm3, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm15 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm30, %zmm5 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm13[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm12[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm15[0,0,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm6, %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-FAST-NEXT: vpermt2q %zmm10, %zmm31, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm14 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm16, %zmm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm10, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm31[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm1 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm1 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vpermq $64, (%rsp), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm5 ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm2 +; AVX512F-FAST-NEXT: vpermq $234, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm29[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm28[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm6 ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm30, %zmm5 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm15, %ymm21 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm27[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm2, %zmm3 ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm23[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm24[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm28[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm5, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm1, %ymm0, %ymm3 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm1, %ymm0, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm10, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm12[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm29[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-FAST-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-FAST-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 -; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm6, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[16],ymm9[16],ymm11[17],ymm9[17],ymm11[18],ymm9[18],ymm11[19],ymm9[19],ymm11[20],ymm9[20],ymm11[21],ymm9[21],ymm11[22],ymm9[22],ymm11[23],ymm9[23] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm16[0],ymm14[0],ymm16[1],ymm14[1],ymm16[2],ymm14[2],ymm16[3],ymm14[3],ymm16[4],ymm14[4],ymm16[5],ymm14[5],ymm16[6],ymm14[6],ymm16[7],ymm14[7],ymm16[16],ymm14[16],ymm16[17],ymm14[17],ymm16[18],ymm14[18],ymm16[19],ymm14[19],ymm16[20],ymm14[20],ymm16[21],ymm14[21],ymm16[22],ymm14[22],ymm16[23],ymm14[23] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm0, %ymm15, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} 
ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm23, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-ONLY-SLOW-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-ONLY-SLOW-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm29, %ymm22, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = 
ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = 
ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[16],ymm8[16],ymm10[17],ymm8[17],ymm10[18],ymm8[18],ymm10[19],ymm8[19],ymm10[20],ymm8[20],ymm10[21],ymm8[21],ymm10[22],ymm8[22],ymm10[23],ymm8[23] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm12, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm7, %ymm15, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm30, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm3, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm15, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm17[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm29, %ymm17, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm26, %zmm7 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm19, %xmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm25, %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm19, %ymm25, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm20, %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm21, %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm20, %xmm20 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[0,0,0,1,4,4,4,5] ; AVX512BW-ONLY-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: 
vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm23, %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm23, %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm23, %xmm24, %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm29, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,0,0,1,4,4,4,5] ; AVX512BW-ONLY-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm24, %zmm19 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm14, %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm29 = ymm30[0],ymm29[0],ymm30[1],ymm29[1],ymm30[2],ymm29[2],ymm30[3],ymm29[3],ymm30[4],ymm29[4],ymm30[5],ymm29[5],ymm30[6],ymm29[6],ymm30[7],ymm29[7],ymm30[16],ymm29[16],ymm30[17],ymm29[17],ymm30[18],ymm29[18],ymm30[19],ymm29[19],ymm30[20],ymm29[20],ymm30[21],ymm29[21],ymm30[22],ymm29[22],ymm30[23],ymm29[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm16[8],ymm14[8],ymm16[9],ymm14[9],ymm16[10],ymm14[10],ymm16[11],ymm14[11],ymm16[12],ymm14[12],ymm16[13],ymm14[13],ymm16[14],ymm14[14],ymm16[15],ymm14[15],ymm16[24],ymm14[24],ymm16[25],ymm14[25],ymm16[26],ymm14[26],ymm16[27],ymm14[27],ymm16[28],ymm14[28],ymm16[29],ymm14[29],ymm16[30],ymm14[30],ymm16[31],ymm14[31] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm16, %ymm14, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm29, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm9, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm11, %ymm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[4],ymm30[4],ymm31[5],ymm30[5],ymm31[6],ymm30[6],ymm31[7],ymm30[7],ymm31[16],ymm30[16],ymm31[17],ymm30[17],ymm31[18],ymm30[18],ymm31[19],ymm30[19],ymm31[20],ymm30[20],ymm31[21],ymm30[21],ymm31[22],ymm30[22],ymm31[23],ymm30[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15],ymm11[24],ymm9[24],ymm11[25],ymm9[25],ymm11[26],ymm9[26],ymm11[27],ymm9[27],ymm11[28],ymm9[28],ymm11[29],ymm9[29],ymm11[30],ymm9[30],ymm11[31],ymm9[31] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, 
%ymm11, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm29, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm29, %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm18, %zmm22, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm4, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm5, %xmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm4, %ymm25, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm1, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm2, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm5[0],xmm22[1],xmm5[1],xmm22[2],xmm5[2],xmm22[3],xmm5[3],xmm22[4],xmm5[4],xmm22[5],xmm5[5],xmm22[6],xmm5[6],xmm22[7],xmm5[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm3, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm23, %xmm6, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm12, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm13, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm2, %ymm14, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm8, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm10, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15],ymm10[24],ymm8[24],ymm10[25],ymm8[25],ymm10[26],ymm8[26],ymm10[27],ymm8[27],ymm10[28],ymm8[28],ymm10[29],ymm8[29],ymm10[30],ymm8[30],ymm10[31],ymm8[31] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm3, %ymm11, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm29, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm18, %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; -; 
AVX512BW-FAST-LABEL: store_i8_stride6_vf64: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm1, %ymm9, %ymm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm13, %ymm2 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm1[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = 
zmm11[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm6 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm17[0],ymm6[1],ymm17[1],ymm6[2],ymm17[2],ymm6[3],ymm17[3],ymm6[4],ymm17[4],ymm6[5],ymm17[5],ymm6[6],ymm17[6],ymm6[7],ymm17[7],ymm6[16],ymm17[16],ymm6[17],ymm17[17],ymm6[18],ymm17[18],ymm6[19],ymm17[19],ymm6[20],ymm17[20],ymm6[21],ymm17[21],ymm6[22],ymm17[22],ymm6[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11],ymm14[12],ymm11[12],ymm14[13],ymm11[13],ymm14[14],ymm11[14],ymm14[15],ymm11[15],ymm14[24],ymm11[24],ymm14[25],ymm11[25],ymm14[26],ymm11[26],ymm14[27],ymm11[27],ymm14[28],ymm11[28],ymm14[29],ymm11[29],ymm14[30],ymm11[30],ymm14[31],ymm11[31] -; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm13, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm6 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm10, %zmm10 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm21 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm12, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm15, %xmm13 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; 
AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm24, %ymm13 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm17, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm20 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm20, %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm16[8],xmm10[8],xmm16[9],xmm10[9],xmm16[10],xmm10[10],xmm16[11],xmm10[11],xmm16[12],xmm10[12],xmm16[13],xmm10[13],xmm16[14],xmm10[14],xmm16[15],xmm10[15] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm13, %xmm16 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-FAST-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm16, %xmm30 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm30, %zmm31, %zmm30 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm31 -; AVX512BW-FAST-NEXT: vpermw %zmm30, %zmm28, %zmm30 -; AVX512BW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm30, %zmm10 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm22, %xmm30 -; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm23, %xmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm21[0],xmm30[0],xmm21[1],xmm30[1],xmm21[2],xmm30[2],xmm21[3],xmm30[3],xmm21[4],xmm30[4],xmm21[5],xmm30[5],xmm21[6],xmm30[6],xmm21[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512BW-FAST-NEXT: vpermw %ymm30, %ymm24, %ymm24 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm30 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm30, %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm21 = 
xmm25[8],xmm21[8],xmm25[9],xmm21[9],xmm25[10],xmm21[10],xmm25[11],xmm21[11],xmm25[12],xmm21[12],xmm25[13],xmm21[13],xmm25[14],xmm21[14],xmm25[15],xmm21[15] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3],xmm30[4],xmm31[4],xmm30[5],xmm31[5],xmm30[6],xmm31[6],xmm30[7],xmm31[7] -; AVX512BW-FAST-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm25, %xmm24 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm24 -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm21 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm24, %xmm26 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm27, %zmm26 -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm28, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm18 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[4],ymm11[4],ymm14[5],ymm11[5],ymm14[6],ymm11[6],ymm14[7],ymm11[7],ymm14[16],ymm11[16],ymm14[17],ymm11[17],ymm14[18],ymm11[18],ymm14[19],ymm11[19],ymm14[20],ymm11[20],ymm14[21],ymm11[21],ymm14[22],ymm11[22],ymm14[23],ymm11[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm14, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] -; AVX512BW-FAST-NEXT: vpermw %zmm18, %zmm19, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; 
AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vpermw %zmm3, %zmm14, %zmm3 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm19, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm5, %ymm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k2} -; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-ONLY-FAST-LABEL: store_i8_stride6_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: pushq %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
32(%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 32(%rsi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 32(%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 32(%rcx), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 32(%rdx), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm22 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[16],ymm14[16],ymm15[17],ymm14[17],ymm15[18],ymm14[18],ymm15[19],ymm14[19],ymm15[20],ymm14[20],ymm15[21],ymm14[21],ymm15[22],ymm14[22],ymm15[23],ymm14[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm22 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm22, %zmm24, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm23, %zmm26, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm27, %xmm10, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-ONLY-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm29, %ymm4, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,0,1,10,10,10,11] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm30, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm25 = ymm21[0],ymm20[0],ymm21[1],ymm20[1],ymm21[2],ymm20[2],ymm21[3],ymm20[3],ymm21[4],ymm20[4],ymm21[5],ymm20[5],ymm21[6],ymm20[6],ymm21[7],ymm20[7],ymm21[16],ymm20[16],ymm21[17],ymm20[17],ymm21[18],ymm20[18],ymm21[19],ymm20[19],ymm21[20],ymm20[20],ymm21[21],ymm20[21],ymm21[22],ymm20[22],ymm21[23],ymm20[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm5[8],xmm17[9],xmm5[9],xmm17[10],xmm5[10],xmm17[11],xmm5[11],xmm17[12],xmm5[12],xmm17[13],xmm5[13],xmm17[14],xmm5[14],xmm17[15],xmm5[15] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm28, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm25 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm31 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm31, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm25, %zmm24, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm31, %xmm11, %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm28, %zmm26, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm16, %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm27, %xmm2, %xmm26 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm29, %ymm28, %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm30, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm31, %xmm3, %xmm27 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm29, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm30 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm30, %xmm6, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm30, %xmm7, %xmm31 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm0[0],xmm31[1],xmm0[1],xmm31[2],xmm0[2],xmm31[3],xmm0[3],xmm31[4],xmm0[4],xmm31[5],xmm0[5],xmm31[6],xmm0[6],xmm31[7],xmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm6, %ymm2, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm31 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm31, %xmm8, %xmm6 +; 
AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm31, %xmm9, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm7, %ymm8, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm7, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm11, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[16],ymm7[16],ymm3[17],ymm7[17],ymm3[18],ymm7[18],ymm3[19],ymm7[19],ymm3[20],ymm7[20],ymm3[21],ymm7[21],ymm3[22],ymm7[22],ymm3[23],ymm7[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm7, %ymm12, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = 
ymm16[0],ymm7[0],ymm16[1],ymm7[1],ymm16[2],ymm7[2],ymm16[3],ymm7[3],ymm16[4],ymm7[4],ymm16[5],ymm7[5],ymm16[6],ymm7[6],ymm16[7],ymm7[7],ymm16[16],ymm7[16],ymm16[17],ymm7[17],ymm16[18],ymm7[18],ymm16[19],ymm7[19],ymm16[20],ymm7[20],ymm16[21],ymm7[21],ymm16[22],ymm7[22],ymm16[23],ymm7[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15],ymm15[24],ymm14[24],ymm15[25],ymm14[25],ymm15[26],ymm14[26],ymm15[27],ymm14[27],ymm15[28],ymm14[28],ymm15[29],ymm14[29],ymm15[30],ymm14[30],ymm15[31],ymm14[31] +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm14, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm14, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm16, %zmm7 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm30, %xmm5, %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm30, %xmm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm17, %ymm2, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm31, %xmm0, %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm31, %xmm5, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm17, %ymm8, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm2, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm0[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm1, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm8 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm18, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm19, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm1, %ymm12, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm20, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm21[8],ymm20[8],ymm21[9],ymm20[9],ymm21[10],ymm20[10],ymm21[11],ymm20[11],ymm21[12],ymm20[12],ymm21[13],ymm20[13],ymm21[14],ymm20[14],ymm21[15],ymm20[15],ymm21[24],ymm20[24],ymm21[25],ymm20[25],ymm21[26],ymm20[26],ymm21[27],ymm20[27],ymm21[28],ymm20[28],ymm21[29],ymm20[29],ymm21[30],ymm20[30],ymm21[31],ymm20[31] +; AVX512BW-ONLY-FAST-NEXT: vpermw %ymm2, %ymm15, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm14, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm23, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm25, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm26, %zmm24 
{%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm27, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-ONLY-FAST-NEXT: popq %rax +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm6, %ymm6 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQBW-SLOW-NEXT: 
vpshufb %xmm7, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[16],ymm9[16],ymm11[17],ymm9[17],ymm11[18],ymm9[18],ymm11[19],ymm9[19],ymm11[20],ymm9[20],ymm11[21],ymm9[21],ymm11[22],ymm9[22],ymm11[23],ymm9[23] +; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm16[0],ymm14[0],ymm16[1],ymm14[1],ymm16[2],ymm14[2],ymm16[3],ymm14[3],ymm16[4],ymm14[4],ymm16[5],ymm14[5],ymm16[6],ymm14[6],ymm16[7],ymm14[7],ymm16[16],ymm14[16],ymm16[17],ymm14[17],ymm16[18],ymm14[18],ymm16[19],ymm14[19],ymm16[20],ymm14[20],ymm16[21],ymm14[21],ymm16[22],ymm14[22],ymm16[23],ymm14[23] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm0, %ymm15, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm23, %xmm6 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm8 +; 
AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512DQBW-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQBW-SLOW-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm8 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm29 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQBW-SLOW-NEXT: # ymm29 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm29, %ymm22, %ymm10 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512DQBW-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] -; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} 
-; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[16],ymm8[16],ymm10[17],ymm8[17],ymm10[18],ymm8[18],ymm10[19],ymm8[19],ymm10[20],ymm8[20],ymm10[21],ymm8[21],ymm10[22],ymm8[22],ymm10[23],ymm8[23] +; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm12, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm7, %ymm15, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm30, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm3, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm15, %ymm26 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm17 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm17[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm29, %ymm17, %ymm27 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm26, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm19, %xmm27 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm25, %xmm28 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm19, %ymm25, %ymm19 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, 
%zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm20, %xmm28 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm21, %xmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm20, %xmm20 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[0,0,0,1,4,4,4,5] ; AVX512DQBW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 -; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = 
ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = 
xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm23, %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm23, %xmm23 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm23, %xmm24, %xmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm29, %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,0,0,1,4,4,4,5] ; AVX512DQBW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 ; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm19 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm14, %ymm29 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm29 = ymm30[0],ymm29[0],ymm30[1],ymm29[1],ymm30[2],ymm29[2],ymm30[3],ymm29[3],ymm30[4],ymm29[4],ymm30[5],ymm29[5],ymm30[6],ymm29[6],ymm30[7],ymm29[7],ymm30[16],ymm29[16],ymm30[17],ymm29[17],ymm30[18],ymm29[18],ymm30[19],ymm29[19],ymm30[20],ymm29[20],ymm30[21],ymm29[21],ymm30[22],ymm29[22],ymm30[23],ymm29[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm16[8],ymm14[8],ymm16[9],ymm14[9],ymm16[10],ymm14[10],ymm16[11],ymm14[11],ymm16[12],ymm14[12],ymm16[13],ymm14[13],ymm16[14],ymm14[14],ymm16[15],ymm14[15],ymm16[24],ymm14[24],ymm16[25],ymm14[25],ymm16[26],ymm14[26],ymm16[27],ymm14[27],ymm16[28],ymm14[28],ymm16[29],ymm14[29],ymm16[30],ymm14[30],ymm16[31],ymm14[31] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = 
[12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm16, %ymm14, %ymm16 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm29, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm9, %ymm30 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm11, %ymm31 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[4],ymm30[4],ymm31[5],ymm30[5],ymm31[6],ymm30[6],ymm31[7],ymm30[7],ymm31[16],ymm30[16],ymm31[17],ymm30[17],ymm31[18],ymm30[18],ymm31[19],ymm30[19],ymm31[20],ymm30[20],ymm31[21],ymm30[21],ymm31[22],ymm30[22],ymm31[23],ymm30[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15],ymm11[24],ymm9[24],ymm11[25],ymm9[25],ymm11[26],ymm9[26],ymm11[27],ymm9[27],ymm11[28],ymm9[28],ymm11[29],ymm9[29],ymm11[30],ymm9[30],ymm11[31],ymm9[31] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm29, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm29, %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm18, %zmm22, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm4, %xmm22 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm5, %xmm26 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm4, %ymm25, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm1, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm2, %xmm22 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm5[0],xmm22[1],xmm5[1],xmm22[2],xmm5[2],xmm22[3],xmm5[3],xmm22[4],xmm5[4],xmm22[5],xmm5[5],xmm22[6],xmm5[6],xmm22[7],xmm5[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm3, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm3, %xmm2 +; 
AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm23, %xmm6, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm12, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm13, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm2, %ymm14, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm8, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm10, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15],ymm10[24],ymm8[24],ymm10[25],ymm8[25],ymm10[26],ymm8[26],ymm10[27],ymm8[27],ymm10[28],ymm8[28],ymm10[29],ymm8[29],ymm10[30],ymm8[30],ymm10[31],ymm8[31] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm3, %ymm11, %ymm3 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm29, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm18, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: store_i8_stride6_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: pushq %rax +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm21 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm22 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[16],ymm14[16],ymm15[17],ymm14[17],ymm15[18],ymm14[18],ymm15[19],ymm14[19],ymm15[20],ymm14[20],ymm15[21],ymm14[21],ymm15[22],ymm14[22],ymm15[23],ymm14[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm23 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm22 = 
ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512DQBW-FAST-NEXT: vpermw %zmm22, %zmm24, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512DQBW-FAST-NEXT: vpermw %zmm23, %zmm26, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512DQBW-FAST-NEXT: vpshufb %xmm27, %xmm10, %xmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm29 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQBW-FAST-NEXT: # ymm29 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpshufb %ymm29, %ymm4, %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,0,1,10,10,10,11] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm30, %zmm23 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm25 = ymm21[0],ymm20[0],ymm21[1],ymm20[1],ymm21[2],ymm20[2],ymm21[3],ymm20[3],ymm21[4],ymm20[4],ymm21[5],ymm20[5],ymm21[6],ymm20[6],ymm21[7],ymm20[7],ymm21[16],ymm20[16],ymm21[17],ymm20[17],ymm21[18],ymm20[18],ymm21[19],ymm20[19],ymm21[20],ymm20[20],ymm21[21],ymm20[21],ymm21[22],ymm20[22],ymm21[23],ymm20[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm5[8],xmm17[9],xmm5[9],xmm17[10],xmm5[10],xmm17[11],xmm5[11],xmm17[12],xmm5[12],xmm17[13],xmm5[13],xmm17[14],xmm5[14],xmm17[15],xmm5[15] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm28, %zmm28 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm25 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm31 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm31, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = +; AVX512DQBW-FAST-NEXT: vpermw %zmm25, %zmm24, %zmm24 +; AVX512DQBW-FAST-NEXT: vpshufb %xmm31, %xmm11, %xmm25 +; AVX512DQBW-FAST-NEXT: vpermw %zmm28, %zmm26, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpshufb %ymm0, %ymm16, %ymm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm27, %xmm2, %xmm26 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm29, %ymm28, %ymm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm30, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm31, %xmm3, %xmm27 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm0, %ymm29, %ymm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm27 +; AVX512DQBW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm30 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm30, %xmm6, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb %xmm30, %xmm7, %xmm31 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm0[0],xmm31[1],xmm0[1],xmm31[2],xmm0[2],xmm31[3],xmm0[3],xmm31[4],xmm0[4],xmm31[5],xmm0[5],xmm31[6],xmm0[6],xmm31[7],xmm0[7] +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512DQBW-FAST-NEXT: vpermw %ymm6, %ymm2, %ymm6 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm31 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm31, %xmm8, %xmm6 +; AVX512DQBW-FAST-NEXT: vpshufb %xmm31, %xmm9, %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQBW-FAST-NEXT: vpermw %ymm7, %ymm8, %ymm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512DQBW-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm7, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512DQBW-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512DQBW-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm11, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] +; AVX512DQBW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} +; AVX512DQBW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm7 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[16],ymm7[16],ymm3[17],ymm7[17],ymm3[18],ymm7[18],ymm3[19],ymm7[19],ymm3[20],ymm7[20],ymm3[21],ymm7[21],ymm3[22],ymm7[22],ymm3[23],ymm7[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQBW-FAST-NEXT: vpermw %ymm7, %ymm12, %ymm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm7 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm16 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm16[0],ymm7[0],ymm16[1],ymm7[1],ymm16[2],ymm7[2],ymm16[3],ymm7[3],ymm16[4],ymm7[4],ymm16[5],ymm7[5],ymm16[6],ymm7[6],ymm16[7],ymm7[7],ymm16[16],ymm7[16],ymm16[17],ymm7[17],ymm16[18],ymm7[18],ymm16[19],ymm7[19],ymm16[20],ymm7[20],ymm16[21],ymm7[21],ymm16[22],ymm7[22],ymm16[23],ymm7[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15],ymm15[24],ymm14[24],ymm15[25],ymm14[25],ymm15[26],ymm14[26],ymm15[27],ymm14[27],ymm15[28],ymm14[28],ymm15[29],ymm14[29],ymm15[30],ymm14[30],ymm15[31],ymm14[31] +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQBW-FAST-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm14, %zmm7 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512DQBW-FAST-NEXT: vpshufb %zmm14, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,2,2,3,6,6,6,7] +; AVX512DQBW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm16, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm5, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb %xmm30, %xmm5, %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512DQBW-FAST-NEXT: vpshufb %xmm30, %xmm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQBW-FAST-NEXT: vpermw %ymm17, %ymm2, %ymm2 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpshufb %xmm31, %xmm0, %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; 
AVX512DQBW-FAST-NEXT: vpshufb %xmm31, %xmm5, %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQBW-FAST-NEXT: vpermw %ymm17, %ymm8, %ymm8 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,0,1] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm2, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm1, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm8 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb %ymm11, %ymm18, %ymm0 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm11, %ymm19, %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] +; AVX512DQBW-FAST-NEXT: vpermw %ymm1, %ymm12, %ymm1 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm13, %ymm20, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm2 +; AVX512DQBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQBW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm21[8],ymm20[8],ymm21[9],ymm20[9],ymm21[10],ymm20[10],ymm21[11],ymm20[11],ymm21[12],ymm20[12],ymm21[13],ymm20[13],ymm21[14],ymm20[14],ymm21[15],ymm20[15],ymm21[24],ymm20[24],ymm21[25],ymm20[25],ymm21[26],ymm20[26],ymm21[27],ymm20[27],ymm21[28],ymm20[28],ymm21[29],ymm20[29],ymm21[30],ymm20[30],ymm21[31],ymm20[31] +; AVX512DQBW-FAST-NEXT: vpermw %ymm2, %ymm15, %ymm2 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: 
vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb %zmm14, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k4} +; AVX512DQBW-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm23, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm25, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm26, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm27, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQBW-FAST-NEXT: popq %rax +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 @@ -5329,10 +5470,8 @@ ; AVX512: {{.*}} ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -23,11 +23,11 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,7] @@ -62,12 +62,12 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; AVX-NEXT: vmovdqa (%r8), %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] ; AVX-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) @@ -98,114 +98,137 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: movdqa (%r8), %xmm5 -; SSE-NEXT: movdqa (%r10), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%r8), %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa (%r10), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,0,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm8, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: 
pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,0] +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] +; SSE-NEXT: pandn %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm11, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,1] -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; 
SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movq %xmm3, 16(%rax) -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: psllq $24, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movq %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movd %xmm0, 24(%rax) -; SSE-NEXT: movdqa %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,zero,xmm0[1,5,9,13],zero,zero,zero,xmm0[2,6] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4,8],zero,zero,zero,zero,xmm1[1,5,9],zero,zero +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,4,8,12],zero,zero,zero,xmm1[1,5,9,13],zero,zero,zero,xmm1[2,6] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm0[0,4,8],zero,zero,zero,zero,xmm0[1,5,9],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,14],zero,zero,zero,xmm0[3,7,11,15],zero,zero,zero,xmm0[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2,6,10],zero,zero,zero,zero,xmm1[3,7,11,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb 
{{.*#+}} xmm1 = xmm1[10,14],zero,zero,zero,xmm1[3,7,11,15],zero,zero,zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,6,10],zero,zero,zero,zero,xmm0[3,7,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpextrd $2, %xmm0, 24(%rax) ; AVX1-ONLY-NEXT: vmovq %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride7_vf4: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-ONLY-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero @@ -221,15 +244,18 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero @@ -245,15 +271,18 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512BW-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,ymm0[26,30],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero @@ -451,94 +480,106 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u],zero,zero,xmm2[5,13,u,u,u],zero,zero,xmm2[6,14,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,5,13],zero,zero,xmm1[u,u,u,6,14],zero,zero,xmm1[u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,12],zero,xmm0[u,u,u,u,5,13],zero,xmm0[u,u,u,u,6,14] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm2[u],zero,zero,xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,0,8],zero,xmm0[u,u,u,u,1,9],zero,xmm0[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u],zero,zero,xmm3[0,u,u,u,u],zero,zero,xmm3[1,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[6,7],zero,zero,zero,zero,zero,xmm6[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[4,5],zero,zero,zero,zero,zero,xmm8[6,7],zero,zero,zero,zero,zero,xmm8[8,9] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,1],zero,zero,zero,zero,zero,xmm8[2,3],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,zero,zero,zero,zero,xmm6[2,3],zero,zero,zero,zero,zero,xmm6[4,5] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,4,5],zero,xmm4[u,u,u,u,6,7],zero,xmm4[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u],zero,zero,xmm5[2,u,u,u,u],zero,zero,xmm5[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[u,u,u,3,11],zero,zero,xmm1[u,u,u,4,12],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,10,u,u,u],zero,zero,xmm2[3,11,u,u,u],zero,zero,xmm2[4,12] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,10],zero,xmm0[u,u,u,u,3,11],zero,xmm0[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rax) -; 
AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovq %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[4,5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[12,15,u,u,u,u,u,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [255,255,0,0,0,0,0,255,255,255,0,0,0,0,0,255] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[8,11],zero,zero,zero,zero,zero,xmm3[10,13],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,xmm4[12,13] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpsrlq $56, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13],zero,xmm1[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,xmm0[5,u,u,u,u],zero,zero,xmm0[6,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovlps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-SLOW-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, 
%ymm3, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[4,13],zero,zero,zero,zero,zero,ymm3[5,14],zero,zero,zero,zero,zero,ymm3[22,31],zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,12],zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,5,13,u,u,u,u,u,6,14,u,u,u,u,u,23,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vmovq %xmm1, 48(%rax) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -546,41 +587,47 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-FAST-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[2,3,2,3,0,1,0,1,u,u,u,u,2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[4,13],zero,zero,zero,zero,zero,ymm2[5,14],zero,zero,zero,zero,zero,ymm2[22,31],zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,12],zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,5,13,u,u,u,u,u,6,14,u,u,u,u,u,23,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovq %xmm1, 48(%rax) +; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -588,41 +635,47 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[2,3,2,3,0,1,0,1,u,u,u,u,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[4,13],zero,zero,zero,zero,zero,ymm2[5,14],zero,zero,zero,zero,zero,ymm2[22,31],zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,12],zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,5,13,u,u,u,u,u,6,14,u,u,u,u,u,23,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,u,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, 48(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -638,11 +691,11 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -679,29 +732,29 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,4,8],zero,zero,zero,zero,ymm2[1,5,9],zero,zero,zero,zero,ymm2[18,22,26],zero,zero,zero,zero,ymm2[19,23,27],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,8],zero,zero,ymm0[u,u,u,1,9],zero,zero,ymm0[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm0[19,27,u,u,u],zero,zero,ymm0[20,28] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero -; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512F-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovq %xmm1, 48(%rax) @@ -721,11 +774,11 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] @@ -760,33 +813,33 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-FAST-NEXT: 
vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] -; AVX512BW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10,18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512BW-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512BW-FAST-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rax) -; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512BW-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 +; 
AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512BW-FAST-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512BW-FAST-NEXT: vmovq %xmm0, 48(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -813,349 +866,342 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa (%r8), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa (%r9), %xmm13 ; SSE-NEXT: movdqa (%rax), %xmm7 -; SSE-NEXT: por %xmm10, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = 
xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm14, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,1,2,3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: 
pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm14, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,7,7,7] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw 
{{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: por %xmm10, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm11[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pandn %xmm3, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,2] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm11[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4] ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm15[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,4] -; SSE-NEXT: pandn %xmm7, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshuflw $234, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm10, 32(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) +; SSE-NEXT: movdqa %xmm9, 16(%rax) ; SSE-NEXT: movdqa %xmm3, 64(%rax) ; SSE-NEXT: movdqa %xmm14, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf16: @@ -1169,25 +1215,24 @@ ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm9[6,7],zero,zero,zero,zero,zero,xmm9[8,9],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[4,5],zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm9[0,1],zero,zero,zero,zero,zero,xmm9[2,3],zero,zero,zero,zero,zero,xmm9[4,5] -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,4,5],zero,xmm8[u,u,u,u,6,7],zero,xmm8[u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm8[6,7],zero,zero,zero,zero,zero,xmm8[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5],zero,zero,zero,zero,zero,xmm9[6,7],zero,zero,zero,zero,zero,xmm9[8,9] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm9[0,1],zero,zero,zero,zero,zero,xmm9[2,3],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[0,1],zero,zero,zero,zero,zero,xmm8[2,3],zero,zero,zero,zero,zero,xmm8[4,5] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,4,5],zero,xmm10[u,u,u,u,6,7],zero,xmm10[u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,7],zero,xmm4[u,u,u,u,u,8],zero,xmm4[u,u,u,u,u,9] @@ -1197,24 +1242,24 @@ ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm10[10,11],zero,zero,zero,zero,zero,xmm10[12,13],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[10,11],zero,zero,zero,zero,zero,xmm9[12,13],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm9 = zero,zero,zero,zero,zero,xmm9[10,11],zero,zero,zero,zero,zero,xmm9[12,13],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[10,11],zero,zero,zero,zero,zero,xmm8[12,13],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm11[u,u,u,u,5,6],zero,xmm11[u,u,u,u,12,13],zero,xmm11[u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[6,u,u,u,u],zero,zero,xmm1[7,u,u,u,u],zero,zero,xmm1[8,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9],zero,xmm8[u,u,u,u,10,11],zero,xmm8[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9],zero,xmm10[u,u,u,u,10,11],zero,xmm10[u,u,u,u,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm9[8,9],zero,zero,zero,zero,zero,xmm9[10,11],zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1234,8 +1279,7 @@ ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u],zero,zero,xmm1[9,u,u,u,u],zero,zero,xmm1[10,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm6[12,13],zero,zero,zero,zero,zero,xmm6[14,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm9[12,13],zero,zero,zero,zero,zero,xmm9[14,15],zero,zero,zero,zero,zero @@ -1349,9 +1393,8 @@ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2] 
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = @@ -1378,10 +1421,8 @@ ; AVX2-FAST-NEXT: vpor %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = @@ -1417,73 +1458,73 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero,ymm9[25] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 
= ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero,zero,ymm11[25] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[12,13],zero,zero,zero,zero,zero,xmm4[14,15],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm8[13,14,15,4,5],zero,zero,xmm8[14,15,14,15,12],zero,zero,xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,14,15,4,5],zero,zero,xmm2[14,15,14,15,12],zero,zero,xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1493,77 +1534,76 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpshufb 
{{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] -; AVX512F-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] +; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = 
zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vporq %zmm12, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512F-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; 
AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] -; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512F-SLOW-NEXT: vpandn %ymm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] -; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -1571,70 +1611,65 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm6 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512F-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] -; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[3,1,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[1],zero,zero,ymm10[u,u,u,10,2],zero,zero,ymm10[u,u,u,11,3],zero,zero,ymm10[u,u,u,20,28],zero,zero,ymm10[u,u,u,21,29],zero,zero,ymm10[u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,3,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[1,9,u,u,u],zero,zero,ymm11[2,10,u,u,u],zero,zero,ymm11[3,19,u,u,u],zero,zero,ymm11[28,20,u,u,u],zero,zero,ymm11[29,21,u] +; AVX512F-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,1,9],zero,ymm12[u,u,u,u,2,10],zero,ymm12[u,u,u,u,19,27],zero,ymm12[u,u,u,u,20,28],zero,ymm12[u,u,u,u,21] +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm12 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[12,13,u,u,u],zero,zero,xmm3[14,15,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,12,13],zero,zero,xmm1[u,u,u,14,15],zero,zero,xmm1[u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10],zero,xmm2[u,u,u,u,13,12],zero,xmm2[u,u,u,u,15,14],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512F-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,5,2,6,1,5,2,6] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u],zero,zero,ymm5[1,5,u,u,u],zero,zero,ymm5[2,6,u,u,u],zero,zero,ymm5[19,23,u,u,u],zero,zero,ymm5[24,28,u,u,u],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm9[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,u,u,1,9],zero,zero,ymm5[u,u,u,2,10],zero,zero,ymm5[u,u,u,19,27],zero,zero,ymm5[u,u,u,20,28],zero,zero +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vporq %zmm3, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm7[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8],zero,ymm5[u,u,u,u,1,9],zero,ymm5[u,u,u,u,18,26],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,4],zero,ymm4[u,u,u,u,1,5],zero,ymm4[u,u,u,u,2,6],zero,ymm4[u,u,u,u,19,23],zero,ymm4[u,u,u,u,24,28],zero,ymm4[u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,0,4,4,5,4] +; AVX512F-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1647,19 +1682,18 @@ ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm8 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm9 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,6,7,7,7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] ; AVX512BW-SLOW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] @@ -1667,11 +1701,11 @@ ; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-SLOW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm6, %xmm2 {%k1} ; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,3,1,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-SLOW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 @@ -1685,13 +1719,16 @@ ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm6, %zmm8, 
%zmm4 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[4],zero,zero,zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[0,2,0,2] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 @@ -1732,66 +1769,66 @@ ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FAST-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm6 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm9 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm10, %ymm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm7[1,3,1,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FAST-NEXT: movl $67637280, %ecx # imm = 0x4081020 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm10, %ymm11 {%k1} +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[1,3,3,1] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm10[1,9],zero,zero,zero,zero,zero,ymm10[2,10],zero,zero,zero,zero,zero,ymm10[3,19],zero,zero,zero,zero,zero,ymm10[28,20],zero,zero,zero,zero,zero,ymm10[29,21],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm8[3,1,1,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[1],zero,zero,zero,zero,zero,ymm12[10,2],zero,zero,zero,zero,zero,ymm12[11,3],zero,zero,zero,zero,zero,ymm12[20,28],zero,zero,zero,zero,zero,ymm12[21,29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX512BW-FAST-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm10 {%k1} ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero ; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512BW-FAST-NEXT: vextracti64x4 $1, %zmm4, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] +; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512BW-FAST-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm1, %ymm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX512BW-FAST-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512BW-FAST-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; 
AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-FAST-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm10, %zmm1 ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] ; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,2,0,2] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm4 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm9[0,2,0,2] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zero,zmm4[18,26],zero,zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zero,zmm4[33,37],zero,zero,zero,zero,zero,zmm4[34,38],zero,zero,zero,zero,zero,zmm4[51,55],zero,zero,zero,zero,zero,zmm4[56,60],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,2,0,2] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FAST-NEXT: vpermw %zmm4, %zmm5, %zmm4 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1} -; AVX512BW-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512BW-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = 
zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -1818,977 +1855,966 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 +; SSE-NEXT: subq $296, %rsp # imm = 0x128 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm6 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm8 +; SSE-NEXT: movdqa 16(%r9), %xmm7 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 ; 
SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa 16(%rax), %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa 16(%rax), %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = 
[255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw 
{{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,0,3] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,0] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%r8), %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; 
SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa (%r9), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa (%rax), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm10 +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pshufd $230, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por 
%xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm14 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,5,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm10[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,1,1,0] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm11 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; 
SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm10, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm10 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm10 -; SSE-NEXT: por %xmm10, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte 
Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm14 -; SSE-NEXT: por %xmm6, %xmm14 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] +; SSE-NEXT: 
movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] ; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,5,5,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm13[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: por %xmm14, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,7,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pshufhw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,3,4,5,6,7] +; SSE-NEXT: 
movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: por %xmm7, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,1,1,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: pandn %xmm7, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] ; SSE-NEXT: pand 
%xmm11, %xmm1 -; SSE-NEXT: pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pandn %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 ; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: por %xmm11, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm11, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 32(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm7, 112(%rax) -; SSE-NEXT: movdqa %xmm14, 176(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movdqa %xmm0, 176(%rax) +; SSE-NEXT: movdqa %xmm2, 144(%rax) +; SSE-NEXT: movdqa %xmm3, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movdqa %xmm12, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $216, %rsp +; 
AVX1-ONLY-NEXT: subq $152, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = 
[0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: 
vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,7],zero,xmm9[u,u,u,u,u,8],zero,xmm9[u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm7[u,u,u,u,5,6],zero,xmm7[u,u,u,u,12,13],zero,xmm7[u] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm10 +; 
AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm11 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm11, %ymm15 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u],zero,zero,xmm3[9,u,u,u,u],zero,zero,xmm3[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u],zero,xmm12[7,u,u,u,u,u],zero,xmm12[8,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm11[u,u,u,u,5,6],zero,xmm11[u,u,u,u,12,13],zero,xmm11[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u],zero,zero,xmm3[8,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u],zero,xmm15[7,u,u,u,u,u],zero,xmm15[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u],zero,xmm4[7,u,u,u,u,u],zero,xmm4[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = 
<6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm4, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6,7],zero,xmm1[9,10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm15[13],zero,zero,zero,zero,zero,zero,xmm15[14],zero,zero,zero,zero,zero,zero,xmm15[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm15[11],zero,zero,zero,zero,zero,zero,xmm15[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u],zero,zero,xmm7[2,u,u,u,u],zero,zero,xmm7[3,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm11[4,5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,2,3],zero,xmm12[u,u,u,u,4,5],zero,xmm12[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7],zero,xmm12[u,u,u,u,8,9],zero,xmm12[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = 
zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 -; 
AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[4,5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[13,u,u,u,u],zero,zero,xmm7[14,u,u,u,u],zero,zero,xmm7[15] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = 
[0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $216, %rsp +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2833,70 +2859,70 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm15 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm9, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = 
<0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u] +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] @@ -2913,67 +2939,65 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = 
[0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = 
<255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: popq %rax @@ -2987,119 +3011,115 @@ ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] -; AVX2-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX2-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm14[4,u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero +; AVX2-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 
= +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] 
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -3142,12 +3162,12 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3158,69 +3178,69 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm14[4,u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero +; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] @@ -3245,51 +3265,51 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = 
ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero @@ -3309,625 +3329,458 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: store_i8_stride7_vf32: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,ymm5[20,21,20,21],zero,ymm5[19],zero,ymm5[19,20,21,22],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm5[23],zero,ymm5[23,24,25,26],zero,ymm5[24],zero,ymm5[30,31] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpandq %ymm16, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm11[18,19,20,21],zero,ymm11[19],zero,ymm11[25,26,27,22],zero,ymm11[20],zero -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpandq %ymm17, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm11[23],zero,ymm11[21,22,23,26],zero,ymm11[24],zero,ymm11[28,29,26,27] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vporq %zmm10, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; 
AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u,u,9] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm7[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u],zero,xmm7[7],zero,xmm7[5,u,u,u],zero,xmm7[8],zero,xmm7[6,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm13[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512F-ONLY-SLOW-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm19 = zmm10[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,0,1,0,4,4,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-SLOW-NEXT: vpandn %ymm7, %ymm9, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm16, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq +; AVX512F-SLOW-LABEL: store_i8_stride7_vf32: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r10), %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] +; AVX512F-SLOW-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6,u,u] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512F-SLOW-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklbw 
{{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512F-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm16 = zmm7[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm12 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm0[0,0,1,0,4,4,5,4] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm20 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm20 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero,ymm1[25] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm7[19],zero,ymm7[21,20,21,22],zero,ymm7[20],zero,ymm7[22,23] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm2, %zmm15 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm15 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14,u,u],zero,zero,zero,zero,ymm3[15,u,u],zero,zero,zero,zero,ymm3[16,u,u],zero,zero,zero,zero,ymm3[17,u,u],zero,zero,zero,zero,ymm3[18] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[0,1,14],zero,ymm7[u,u,0,1,14,15],zero,ymm7[u,u,13,2,3,16],zero,ymm7[u,u,28,29,16,17],zero,ymm7[u,u,19,28,29,18],zero +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u],zero,ymm13[14,u,u,u,u,u],zero,ymm13[15,u,u,u,u,u],zero,ymm13[16,u,u,u,u,u],zero,ymm13[17,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,14],zero,ymm6[u,u,u,u,u,15],zero,ymm6[u,u,u,u,u,16],zero,ymm6[u,u,u,u,u,17],zero,ymm6[u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; 
AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm4[u,u,u,u,u,14],zero,ymm4[u,u,u,u,u,15],zero,ymm4[u,u,u,u,u,16],zero,ymm4[u,u,u,u,u,17],zero,ymm4[u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[13,u,u,u,u,u],zero,ymm5[14,u,u,u,u,u],zero,ymm5[15,u,u,u,u,u],zero,ymm5[16,u,u,u,u,u],zero,ymm5[17,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm2, %ymm8, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm9[13,u,u,u,u],zero,zero,ymm9[14,u,u,u,u],zero,zero,ymm9[15,u,u,u,u],zero,zero,ymm9[16,u,u,u,u],zero,zero,ymm9[17,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm13[30],zero,ymm13[28,u,u,u],zero,ymm13[31],zero,ymm13[29,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm7[28],zero,ymm7[30,31,30,31],zero,ymm7[29],zero,ymm7[31,28,29] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27,u,u,u],zero,ymm5[30],zero,ymm5[28,u,u,u],zero,ymm5[31],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} 
ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride7_vf32: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 (%r10), %ymm17 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512F-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512F-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm13, %zmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm10 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512F-FAST-NEXT: vpor %xmm7, 
%xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm7, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm8, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,3,2,10,11,10,11] +; AVX512F-FAST-NEXT: vpermi2q %zmm7, %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm0[18],zero,ymm0[20,21,20,21],zero,ymm0[19],zero,ymm0[19,20,21,22],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm7, %zmm9, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u],zero,ymm5[14,u,u,u,u,u],zero,ymm5[15,u,u,u,u,u],zero,ymm5[16,u,u,u,u,u],zero,ymm5[17,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,14],zero,ymm6[u,u,u,u,u,15],zero,ymm6[u,u,u,u,u,16],zero,ymm6[u,u,u,u,u,17],zero,ymm6[u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512F-FAST-NEXT: vpunpckhbw 
{{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm16 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm16 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm0[u,u,u,u,u,14],zero,ymm0[u,u,u,u,u,15],zero,ymm0[u,u,u,u,u,16],zero,ymm0[u,u,u,u,u,17],zero,ymm0[u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm12 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm8, %ymm18, %ymm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm2[13,u,u,u,u],zero,zero,ymm2[14,u,u,u,u],zero,zero,ymm2[15,u,u,u,u],zero,zero,ymm2[16,u,u,u,u],zero,zero,ymm2[17,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[4],zero,xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,0,4,4,5,4] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-FAST-NEXT: vpandn %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm13[13,u,u,u,u],zero,zero,ymm13[14,u,u,u,u],zero,zero,ymm13[15,u,u,u,u],zero,zero,ymm13[16,u,u,u,u],zero,zero,ymm13[17,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpand %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm10, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0] -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm5[30],zero,ymm5[28,u,u,u],zero,ymm5[31],zero,ymm5[29,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm9, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; 
AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512DQ-SLOW-LABEL: store_i8_stride7_vf32: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,ymm5[20,21,20,21],zero,ymm5[19],zero,ymm5[19,20,21,22],zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm5[23],zero,ymm5[23,24,25,26],zero,ymm5[24],zero,ymm5[30,31] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpandq %ymm16, %ymm8, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm11[18,19,20,21],zero,ymm11[19],zero,ymm11[25,26,27,22],zero,ymm11[20],zero -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,1,1,4,4,5,5] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512DQ-SLOW-NEXT: # ymm17 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpandq %ymm17, %ymm8, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm11[23],zero,ymm11[21,22,23,26],zero,ymm11[24],zero,ymm11[28,29,26,27] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vporq %zmm10, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u,u,9] -; AVX512DQ-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm7[0,1,0,1,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u],zero,xmm7[7],zero,xmm7[5,u,u,u],zero,xmm7[8],zero,xmm7[6,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX512DQ-SLOW-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm13[0,1,0,1,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm13 
-; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512DQ-SLOW-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm19 = zmm10[0,1,0,1,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[1,1,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,0,1,0,4,4,5,4] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm10 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: 
vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512DQ-SLOW-NEXT: vpandn %ymm7, %ymm9, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm16, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm17, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; ; AVX512BW-SLOW-LABEL: 
store_i8_stride7_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[4,u,u,u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6] +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[4],zero,xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero +; AVX512BW-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %r11 # imm = 0x4081020408102040 +; AVX512BW-SLOW-NEXT: kmovq %r11, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} +; 
AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u],zero,xmm6[7],zero,xmm6[5,u,u,u],zero,xmm6[8],zero,xmm6[6,u,u] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,7],zero,xmm7[5],zero,xmm7[u,u,u,8],zero,xmm7[6],zero,xmm7[u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm11, %zmm4 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm4[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm14, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm14, %zmm4 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C +; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[0,1,0,1,14],zero,ymm0[14,15,0,1,14,15],zero,ymm0[13,14,15,16,17,16],zero,ymm0[30,31,30,31,16,17],zero,ymm0[31,28,29,30,31] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm12, %ymm5 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-SLOW-NEXT: 
vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-SLOW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm5 {%k1} ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm9 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm15, %ymm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k1} ; AVX512BW-SLOW-NEXT: 
movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm9 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm9, %zmm15, %zmm9 -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm10[18],zero,zero,zero,zero,zmm10[21],zero,zmm10[19],zero,zero,zero,zero,zmm10[22],zero,zmm10[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero,zmm10[57] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5] ; AVX512BW-SLOW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm16, %ymm17, %ymm16 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} 
ymm12 = ymm12[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512BW-SLOW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm16, %zmm17, %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm11 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm7[18],zero,zmm7[20,21,20,21],zero,zmm7[19],zero,zmm7[19,20,21,22],zero,zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm7[55],zero,zmm7[55,56,57,58],zero,zmm7[56],zero,zmm7[62,63] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm12 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[20],zero,zmm6[18],zero,zero,zero,zero,zmm6[21],zero,zmm6[19],zero,zero,zero,zero,zmm6[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm6[57],zero,zmm6[55],zero,zero,zero,zero,zmm6[58],zero,zmm6[56],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm11, %zmm12, %zmm11 ; AVX512BW-SLOW-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm11 {%k2} ; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512BW-SLOW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = 
xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm8 {%k2} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm11, %zmm11 -; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} -; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm8 {%k2} -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: movl $338170920, %ecx # imm = 0x14281428 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k2} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm0[0,2,3,3,4,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,2,3] ; AVX512BW-SLOW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512BW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512BW-SLOW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} ; AVX512BW-SLOW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -3951,9 +3804,9 @@ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 @@ -3963,88 +3816,90 @@ ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm12, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm12 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm15 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm7, %ymm16 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm7, %ymm15, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 ; AVX512BW-FAST-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm14 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: 
vmovdqu8 %zmm10, %zmm0 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm16 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm16, %zmm10 -; AVX512BW-FAST-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 -; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm16, %zmm17, %zmm16 -; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 -; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm16 {%k1} -; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C -; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] 
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512BW-FAST-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] +; AVX512BW-FAST-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512BW-FAST-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm8, %zmm8 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-FAST-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero +; 
AVX512BW-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpermw %zmm10, %zmm11, %zmm10 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm9 {%k1} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm9[19],zero,zmm9[21,20,21,22],zero,zmm9[20],zero,zmm9[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm9[55],zero,zmm9[53,54,55,58],zero,zmm9[56],zero,zmm9[60,61,58,59] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm10[21],zero,zmm10[19],zero,zero,zero,zero,zmm10[22],zero,zmm10[20],zero,zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm10[57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm11[18],zero,zero,zero,zero,zmm11[21],zero,zmm11[19],zero,zero,zero,zero,zmm11[22],zero,zmm11[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm11[55],zero,zero,zero,zero,zmm11[58],zero,zmm11[56],zero,zero,zero,zero,zmm11[59],zero,zmm11[57] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512BW-FAST-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 +; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm9, %zmm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = 
zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm12 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm11, %zmm12, %zmm11 +; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 +; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C +; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm10 {%k1} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero @@ -4073,8 +3928,8 @@ ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -5504,548 +5359,519 @@ ; SSE-NEXT: movaps %xmm0, 400(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rax) -; SSE-NEXT: addq $648, %rsp # imm = 0x288 -; SSE-NEXT: retq -; -; AVX1-ONLY-LABEL: store_i8_stride7_vf64: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rax) +; SSE-NEXT: addq $648, %rsp # imm = 0x288 +; SSE-NEXT: retq +; +; AVX1-ONLY-LABEL: store_i8_stride7_vf64: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: subq $536, %rsp # imm = 0x218 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[6,u,u,u,u],zero,zero,xmm4[7,u,u,u,u],zero,zero,xmm4[8,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm4[4,u,u,u,u],zero,zero,xmm4[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 
-; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: 
vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,6,7],zero,xmm8[u,u,u,u,8,9],zero,xmm8[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; 
AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 
-; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm12[4,u,u,u,u],zero,zero,xmm12[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,4,5],zero,xmm3[u,u,u,u,6,7],zero,xmm3[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[u,u,u,u,5,6],zero,xmm3[u,u,u,u,12,13],zero,xmm3[u] +; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm6 = xmm12[6,u,u,u,u],zero,zero,xmm12[7,u,u,u,u],zero,zero,xmm12[8,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[u,u,u,7],zero,xmm14[u,u,u,u,u,8],zero,xmm14[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13,u,u,u,u],zero,zero,xmm12[14,u,u,u,u],zero,zero,xmm12[15] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,6,7],zero,xmm11[u,u,u,u,8,9],zero,xmm11[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u],zero,zero,xmm12[11,u,u,u,u],zero,zero,xmm12[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 
{{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm14[8],xmm8[9],xmm14[9],xmm8[10],xmm14[10],xmm8[11],xmm14[11],xmm8[12],xmm14[12],xmm8[13],xmm14[13],xmm8[14],xmm14[14],xmm8[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[6,u,u,u,u],zero,zero,xmm10[7,u,u,u,u],zero,zero,xmm10[8,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw 
{{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm15, 
%xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm6[4,u,u,u,u],zero,zero,xmm6[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u],zero,zero,xmm6[2,u,u,u,u],zero,zero,xmm6[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; 
AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm8, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2],zero,xmm8[u,u,6,7,8,9],zero,xmm8[u,u,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm3[9,u,u],zero,zero,zero,zero,xmm3[10,u,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3],zero,xmm8[u,6,7,8,9,10],zero,xmm8[u,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm4[9,u],zero,zero,zero,zero,zero,xmm4[10,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4],zero,xmm8[6,7,8,9,10,11],zero,xmm8[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,zero,xmm6[9],zero,zero,zero,zero,zero,zero,xmm6[10],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4],zero,xmm0[u,u,8,9,10,11],zero,xmm0[u,u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u],zero,zero,zero,zero,xmm3[7,u,u],zero,zero,zero,zero,xmm3[8,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4,5],zero,xmm0[u,8,9,10,11,12],zero,xmm0[u,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u],zero,zero,zero,zero,zero,xmm4[7,u],zero,zero,zero,zero,zero,xmm4[8,u],zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6],zero,xmm0[8,9,10,11,12,13],zero,xmm0[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm6[6],zero,zero,zero,zero,zero,zero,xmm6[7],zero,zero,zero,zero,zero,zero,xmm6[8],zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm3[11,u,u],zero,zero,zero,zero,xmm3[12,u,u],zero,zero,zero,zero,xmm3[13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[u,4,5,6,7,8],zero,xmm1[u,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm4[11,u],zero,zero,zero,zero,zero,xmm4[12,u],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8,9],zero,xmm1[11,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[11],zero,zero,zero,zero,zero,zero,xmm6[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[13,u],zero,zero,zero,zero,zero,xmm4[14,u],zero,zero,zero,zero,zero,xmm4[15,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, 
%xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,13,u,u,u,u,128,128,14,u,u,u,u,128,128,15> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <10,128,u,u,u,u,13,12,128,u,u,u,u,15,14,128> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = 
xmm15[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u],zero,zero,xmm10[2,u,u,u,u],zero,zero,xmm10[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,4,5],zero,xmm15[u,u,u,u,6,7],zero,xmm15[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: 
vandnps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6054,7 +5880,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: addq $536, %rsp # imm = 0x218 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6066,29 +5892,29 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[7],zero,ymm0[11,4,5,10],zero,ymm0[8],zero,ymm0[10,11,10,11],zero,ymm0[9],zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,ymm1[11],zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm5[9],zero,ymm5[7],zero,zero,zero,zero,ymm5[10],zero,ymm5[8],zero,zero,zero,zero,ymm5[11],zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6096,7 +5922,7 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6112,7 +5938,7 @@ ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -6145,556 +5971,562 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-SLOW-NEXT: 
vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[1,1,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = 
xmm15[0,1,2,0] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm14, %ymm15, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm14, 
%xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm13, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte 
Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm2 +; 
AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
[4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm5 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; 
AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm11, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, 
%ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm1[7],zero,ymm1[11,4,5,10],zero,ymm1[8],zero,ymm1[10,11,10,11],zero,ymm1[9],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[9],zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm7[9],zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,ymm7[11],zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[25],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm8, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 
%ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm11 ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm7 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 ; AVX2-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm9, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = 
[0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, 
%ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 320(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) ; AVX2-SLOW-NEXT: addq $824, %rsp # imm = 0x338 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: subq $568, %rsp # imm = 0x238 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 
+; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[7],zero,ymm1[11,4,5,10],zero,ymm1[8],zero,ymm1[10,11,10,11],zero,ymm1[9],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,ymm2[11],zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm7[9],zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,ymm6[11],zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[1,2,3,14],zero,ymm6[12],zero,ymm6[12,13,14,15],zero,ymm6[13],zero,ymm6[15],zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[11],zero,zero,zero,zero,ymm7[14],zero,ymm7[12],zero,zero,zero,zero,ymm7[15],zero,ymm7[13],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm9[14],zero,ymm9[12],zero,zero,zero,zero,ymm9[15],zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm8[14],zero,ymm8[12],zero,zero,zero,zero,ymm8[15],zero,ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, 
%ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX2-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = 
ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm14, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm14 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: 
vpshufb %xmm6, %xmm11, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] @@ -6706,222 +6538,208 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: 
vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] -; 
AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm11[7],zero,ymm11[11,4,5,10],zero,ymm11[8],zero,ymm11[10,11,10,11],zero,ymm11[9],zero,ymm11[23],zero,ymm11[27,20,21,26],zero,ymm11[24],zero,ymm11[26,27,26,27],zero,ymm11[25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,ymm6[11],zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm2, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; 
AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,ymm1[11],zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-NEXT: 
vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm1[1,2,3,14],zero,ymm1[12],zero,ymm1[12,13,14,15],zero,ymm1[13],zero,ymm1[15],zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[11],zero,zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm6[14],zero,ymm6[12],zero,zero,zero,zero,ymm6[15],zero,ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,ymm11[14],zero,ymm11[12],zero,zero,zero,zero,ymm11[15],zero,ymm11[13],zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm14, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm13, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
[255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm1, 320(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 352(%rax) ; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6929,211 +6747,207 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: addq $568, %rsp # imm = 0x238 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: subq $568, %rsp # imm = 0x238 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[7],zero,ymm1[11,4,5,10],zero,ymm1[8],zero,ymm1[10,11,10,11],zero,ymm1[9],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,ymm2[11],zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm7[9],zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,ymm6[11],zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[1,2,3,14],zero,ymm6[12],zero,ymm6[12,13,14,15],zero,ymm6[13],zero,ymm6[15],zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[11],zero,zero,zero,zero,ymm7[14],zero,ymm7[12],zero,zero,zero,zero,ymm7[15],zero,ymm7[13],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm9[14],zero,ymm9[12],zero,zero,zero,zero,ymm9[15],zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm8[14],zero,ymm8[12],zero,zero,zero,zero,ymm8[15],zero,ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = 
<0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm14, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm15, %xmm12 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] 
-; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] @@ -7145,16 +6959,16 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = 
xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] @@ -7163,8 +6977,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] @@ -7174,192 +6988,179 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte 
Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm11[7],zero,ymm11[11,4,5,10],zero,ymm11[8],zero,ymm11[10,11,10,11],zero,ymm11[9],zero,ymm11[23],zero,ymm11[27,20,21,26],zero,ymm11[24],zero,ymm11[26,27,26,27],zero,ymm11[25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,ymm6[11],zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[23],zero,ymm5[27,20,21,26],zero,ymm5[24],zero,ymm5[26,27,26,27],zero,ymm5[25] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor 
%ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,ymm1[11],zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm1[1,2,3,14],zero,ymm1[12],zero,ymm1[12,13,14,15],zero,ymm1[13],zero,ymm1[15],zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[11],zero,zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm6[14],zero,ymm6[12],zero,zero,zero,zero,ymm6[15],zero,ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = 
zero,zero,ymm11[14],zero,ymm11[12],zero,zero,zero,zero,ymm11[15],zero,ymm11[13],zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm13, %ymm13 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm13, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; 
AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm13, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-PERLANE-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm9, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7367,2303 +7168,1469 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: addq $568, %rsp # imm = 0x238 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $1416, %rsp # imm = 0x588 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: subq $1656, %rsp # imm = 0x678 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm12 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[0,1,14],zero,ymm12[12,13,0,1,14,15],zero,ymm12[3,12,13,2,3,16],zero,ymm12[30,31,28,29,16,17],zero,ymm12[31,18,19,28,29,18],zero ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[0,1,0,1,14],zero,ymm7[14,15,0,1,14,15],zero,ymm7[13,14,15,16,17,16],zero,ymm7[30,31,30,31,16,17],zero,ymm7[31,28,29,30,31] ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[13,u,u,u,u,u],zero,ymm8[14,u,u,u,u,u],zero,ymm8[15,u,u,u,u,u],zero,ymm8[16,u,u,u,u,u],zero,ymm8[17,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm5[14],zero,ymm5[12],zero,zero,zero,zero,ymm5[15],zero,ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,9,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[12,13,14],zero,ymm3[12],zero,ymm3[14,15,14,15],zero,ymm3[13],zero,ymm3[15,12,13,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <9,128,7,u,u,u,128,10,128,8,u,u,u,128,11,128,25,128,23,u,u,u,128,26,128,24,u,u,u,128,27,128> +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm5[14],zero,ymm5[12],zero,zero,zero,zero,ymm5[15],zero,ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm19 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, 
%zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[11,u,u,u],zero,ymm6[14],zero,ymm6[12,u,u,u],zero,ymm6[15],zero,ymm6[13,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,128,7,128,5,6,7,10,128,8,128,12,13,10,11,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[11],zero,zero,zero,zero,ymm5[14],zero,ymm5[12],zero,zero,zero,zero,ymm5[15],zero,ymm5[13],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[13],zero,ymm6[11,u,u,u],zero,ymm6[14],zero,ymm6[12,u,u,u],zero,ymm6[15],zero,ymm6[29],zero,ymm6[27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,128,7,128,7,8,9,10,128,8,128,14,15,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-SLOW-NEXT: vporq %xmm5, %xmm6, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm6[4,u,u,u],zero,xmm6[7],zero,xmm6[5,u,u,u],zero,xmm6[8],zero,xmm6[6] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512F-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm10[5],zero,ymm10[3],zero,zero,zero,zero,ymm10[6],zero,ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[2,3,4,5],zero,ymm12[3],zero,ymm12[5,4,5,6],zero,ymm12[4],zero,ymm12[6,7,18,19,20,21],zero,ymm12[19],zero,ymm12[21,20,21,22],zero,ymm12[20],zero,ymm12[22,23] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm4[2],zero,zero,zero,zero,ymm4[5],zero,ymm4[3],zero,zero,zero,zero,ymm4[6],zero,ymm4[4],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20] +; AVX512F-SLOW-NEXT: 
vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[2],zero,ymm7[2,3,4,5],zero,ymm7[3],zero,ymm7[9,10,11,6],zero,ymm7[4],zero,ymm7[18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm8[18,19,20,21],zero,ymm8[19],zero,ymm8[25,26,27,22],zero,ymm8[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm8[4],zero,ymm8[2],zero,zero,zero,zero,ymm8[5],zero,ymm8[3],zero,zero,zero,zero,ymm8[6],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm31 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[4],zero,ymm9[2],zero,ymm9[4,5,4,5],zero,ymm9[3],zero,ymm9[3,4,5,6],zero,ymm9[20],zero,ymm9[18],zero,ymm9[20,21,20,21],zero,ymm9[19],zero,ymm9[19,20,21,22],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm3, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 -; 
AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm9 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm1, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u> +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm22 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u> -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vporq %xmm8, %xmm9, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm9 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[0,1,14],zero,ymm0[12,13,0,1,14,15],zero,ymm0[3,12,13,2,3,16],zero,ymm0[30,31,28,29,16,17],zero,ymm0[31,18,19,28,29,18],zero -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm28 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vporq %xmm5, %xmm4, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm7 -; AVX512F-SLOW-NEXT: vporq %xmm4, %xmm7, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm6 -; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: 
vinserti64x4 $1, %ymm23, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm27 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm6[0,1,0,1],zmm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm12 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm4, %ymm11, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; 
AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm20, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,0,1],zmm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm22, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} 
xmm13 = +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm15 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm5[14],zero,ymm5[12],zero,zero,zero,zero,ymm5[15],zero,ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[12,13,14],zero,ymm0[12],zero,ymm0[14,15,14,15],zero,ymm0[13],zero,ymm0[15,12,13,28,29,30],zero,ymm0[28],zero,ymm0[30,31,30,31],zero,ymm0[29],zero,ymm0[31,28,29] +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm27 -; AVX512F-SLOW-NEXT: 
vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm0[14],zero,ymm0[12],zero,zero,zero,zero,ymm0[15],zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm14, %ymm28, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[11,u,u,u],zero,ymm0[14],zero,ymm0[12,u,u,u],zero,ymm0[15],zero,ymm0[13,u,27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero,ymm0[29,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm9, %zmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm14[11],zero,zero,zero,zero,ymm14[14],zero,ymm14[12],zero,zero,zero,zero,ymm14[15],zero,ymm14[13],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[13],zero,ymm14[11,u,u,u],zero,ymm14[14],zero,ymm14[12,u,u,u],zero,ymm14[15],zero,ymm14[29],zero,ymm14[27,u,u,u],zero,ymm14[30],zero,ymm14[28,u,u,u],zero,ymm14[31],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 +; AVX512F-SLOW-NEXT: 
vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm14, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm7[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm15[0,1,0,1] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm23, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm20, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[2],zero,zero,zero,zero,ymm8[5],zero,ymm8[3],zero,zero,zero,zero,ymm8[6],zero,ymm8[4],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22],zero,ymm8[20] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2],zero,ymm9[2,3,4,5],zero,ymm9[3],zero,ymm9[9,10,11,6],zero,ymm9[4],zero,ymm9[18],zero,ymm9[18,19,20,21],zero,ymm9[19],zero,ymm9[25,26,27,22],zero,ymm9[20],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,ymm12[5],zero,ymm12[3],zero,zero,zero,zero,ymm12[6],zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,4,5],zero,ymm11[3],zero,ymm11[5,4,5,6],zero,ymm11[4],zero,ymm11[6,7,18,19,20,21],zero,ymm11[19],zero,ymm11[21,20,21,22],zero,ymm11[20],zero,ymm11[22,23] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm21 = ymm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; 
AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 -; AVX512F-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm19, %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm6 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm6, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm6, %ymm13, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm26[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm12 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm30[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm22[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm6, %ymm10, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm21[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm6, %ymm4, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; 
AVX512F-SLOW-NEXT: # ymm5 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm6, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm31[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm11 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm15 = mem[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm29[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm27[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm21 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm23 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm20, %zmm23, %zmm20 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm23 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm29 
= mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm23, %zmm29, %zmm23 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm23 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm29 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm20, %zmm29, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm29, %zmm20 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm23 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm30 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm23, %zmm30, %zmm23 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm30 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm28 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm30, %zmm28, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm28 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vporq %ymm31, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vporq %ymm5, %ymm18, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm15[0,0,1,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpor %ymm12, %ymm13, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm0[4],zero,ymm0[2],zero,zero,zero,zero,ymm0[5],zero,ymm0[3],zero,zero,zero,zero,ymm0[6],zero,ymm0[20],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[4],zero,ymm0[2],zero,ymm0[4,5,4,5],zero,ymm0[3],zero,ymm0[3,4,5,6],zero,ymm0[20],zero,ymm0[18],zero,ymm0[20,21,20,21],zero,ymm0[19],zero,ymm0[19,20,21,22],zero +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm12, %ymm10 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm26[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm25[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm21[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm22[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm0 = mem[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm16 ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm6 -; AVX512F-SLOW-NEXT: vporq %ymm19, %ymm20, %ymm8 -; AVX512F-SLOW-NEXT: 
vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm8[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm24 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm24 -; AVX512F-SLOW-NEXT: vporq %ymm21, %ymm22, %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm14[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm9[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm3[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm5 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm6 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm10 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm10 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm6 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq $16, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,1,0,4,4,5,4] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm6 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm10 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm6, %zmm10, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm24 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm24 +; AVX512F-SLOW-NEXT: vpor %ymm12, %ymm13, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm17[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm8, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm9 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) -; AVX512F-SLOW-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512F-SLOW-NEXT: addq $1656, %rsp # imm = 0x678 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpor 
%xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb 
%xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; 
AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-ONLY-FAST-NEXT: # 
ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1496, %rsp # imm = 0x5D8 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: 
vmovdqa 32(%r9), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm13 -; AVX512DQ-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero,zero,zero,ymm15[18] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm7, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 
-; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [2,2,3,3,2,2,3,3] -; 
AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm23, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm31, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = 
ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm9[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm13 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = 
[18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm9 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm5, %ymm7, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpandq %ymm5, %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm3, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm8 -; 
AVX512DQ-FAST-NEXT: vpandq %ymm31, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm5, %zmm22 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm26 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,4,5,5,7,4,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm20 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm11[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm14, %ymm1 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 -; 
AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $1496, %rsp # imm = 0x5D8 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-FAST-LABEL: store_i8_stride7_vf64: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: subq $1784, %rsp # imm = 0x6F8 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[0,1,14],zero,ymm11[12,13,0,1,14,15],zero,ymm11[3,12,13,2,3,16],zero,ymm11[30,31,28,29,16,17],zero,ymm11[31,18,19,28,29,18],zero +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[0,1,0,1,14],zero,ymm7[14,15,0,1,14,15],zero,ymm7[13,14,15,16,17,16],zero,ymm7[30,31,30,31,16,17],zero,ymm7[31,28,29,30,31] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[13,u,u,u,u,u],zero,ymm15[14,u,u,u,u,u],zero,ymm15[15,u,u,u,u,u],zero,ymm15[16,u,u,u,u,u],zero,ymm15[17,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14],zero,ymm4[12],zero,zero,zero,zero,ymm4[15],zero,ymm4[13],zero,zero,zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,9,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[12,13,14],zero,ymm3[12],zero,ymm3[14,15,14,15],zero,ymm3[13],zero,ymm3[15,12,13,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <9,128,7,u,u,u,128,10,128,8,u,u,u,128,11,128,25,128,23,u,u,u,128,26,128,24,u,u,u,128,27,128> +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm6[14],zero,ymm6[12],zero,zero,zero,zero,ymm6[15],zero,ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[11,u,u,u],zero,ymm4[14],zero,ymm4[12,u,u,u],zero,ymm4[15],zero,ymm4[13,u,27,u,u,u],zero,ymm4[30],zero,ymm4[28,u,u,u],zero,ymm4[31],zero,ymm4[29,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,128,7,128,5,6,7,10,128,8,128,12,13,10,11,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm8[11],zero,zero,zero,zero,ymm8[14],zero,ymm8[12],zero,zero,zero,zero,ymm8[15],zero,ymm8[13],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[13],zero,ymm4[11,u,u,u],zero,ymm4[14],zero,ymm4[12,u,u,u],zero,ymm4[15],zero,ymm4[29],zero,ymm4[27,u,u,u],zero,ymm4[30],zero,ymm4[28,u,u,u],zero,ymm4[31],zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,128,7,128,7,8,9,10,128,8,128,14,15,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm8 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u],zero,xmm6[7],zero,xmm6[5,u,u,u],zero,xmm6[8],zero,xmm6[6,u,u,u],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm21 +; AVX512F-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm20 +; AVX512F-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm10 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[4],zero,xmm0[u,u,u,7],zero,xmm0[5],zero,xmm0[u,u,u,8],zero,xmm0[6],zero +; AVX512F-FAST-NEXT: vpor %xmm9, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm2[5],zero,ymm2[3],zero,zero,zero,zero,ymm2[6],zero,ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb 
%ymm3, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[2,3,4,5],zero,ymm11[3],zero,ymm11[5,4,5,6],zero,ymm11[4],zero,ymm11[6,7,18,19,20,21],zero,ymm11[19],zero,ymm11[21,20,21,22],zero,ymm11[20],zero,ymm11[22,23] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm13[2],zero,zero,zero,zero,ymm13[5],zero,ymm13[3],zero,zero,zero,zero,ymm13[6],zero,ymm13[4],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[2],zero,ymm7[2,3,4,5],zero,ymm7[3],zero,ymm7[9,10,11,6],zero,ymm7[4],zero,ymm7[18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm15[4],zero,ymm15[2],zero,zero,zero,zero,ymm15[5],zero,ymm15[3],zero,zero,zero,zero,ymm15[6],zero,ymm15[20],zero,ymm15[18],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[4],zero,ymm15[2],zero,ymm15[4,5,4,5],zero,ymm15[3],zero,ymm15[3,4,5,6],zero,ymm15[20],zero,ymm15[18],zero,ymm15[20,21,20,21],zero,ymm15[19],zero,ymm15[19,20,21,22],zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm25 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,3,2,10,11,10,11] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,3,2,3,8,8,9,8] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 
%xmm22, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512F-FAST-NEXT: vporq %xmm1, %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,xmm2[7],zero,xmm2[5,u,u,u],zero,xmm2[8],zero,xmm2[6,u,u,u],zero +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero,xmm4[u,u,u,9] +; AVX512F-FAST-NEXT: vporq %xmm1, %xmm7, %xmm22 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,9,8,9] +; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512F-FAST-NEXT: vporq %xmm9, %xmm12, %xmm28 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[0,1,14],zero,ymm0[12,13,0,1,14,15],zero,ymm0[3,12,13,2,3,16],zero,ymm0[30,31,28,29,16,17],zero,ymm0[31,18,19,28,29,18],zero +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm12, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = 
[255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm7[14],zero,ymm7[12],zero,zero,zero,zero,ymm7[15],zero,ymm7[13],zero,zero,zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13,14],zero,ymm12[12],zero,ymm12[14,15,14,15],zero,ymm12[13],zero,ymm12[15,12,13,28,29,30],zero,ymm12[28],zero,ymm12[30,31,30,31],zero,ymm12[29],zero,ymm12[31,28,29] +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm1 
+; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm9[14],zero,ymm9[12],zero,zero,zero,zero,ymm9[15],zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm12[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[11,u,u,u],zero,ymm12[14],zero,ymm12[12,u,u,u],zero,ymm12[15],zero,ymm12[13,u,27,u,u,u],zero,ymm12[30],zero,ymm12[28,u,u,u],zero,ymm12[31],zero,ymm12[29,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[11],zero,zero,zero,zero,ymm12[14],zero,ymm12[12],zero,zero,zero,zero,ymm12[15],zero,ymm12[13],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm26 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[13],zero,ymm12[11,u,u,u],zero,ymm12[14],zero,ymm12[12,u,u,u],zero,ymm12[15],zero,ymm12[29],zero,ymm12[27,u,u,u],zero,ymm12[30],zero,ymm12[28,u,u,u],zero,ymm12[31],zero +; AVX512F-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm31, %zmm6 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm22, %zmm19 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm28, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2],zero,zero,zero,zero,ymm0[5],zero,ymm0[3],zero,zero,zero,zero,ymm0[6],zero,ymm0[4],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[2],zero,ymm1[2,3,4,5],zero,ymm1[3],zero,ymm1[9,10,11,6],zero,ymm1[4],zero,ymm1[18],zero,ymm1[18,19,20,21],zero,ymm1[19],zero,ymm1[25,26,27,22],zero,ymm1[20],zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm1[5],zero,ymm1[3],zero,zero,zero,zero,ymm1[6],zero,ymm1[4],zero,zero,zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[2,3,4,5],zero,ymm1[3],zero,ymm1[5,4,5,6],zero,ymm1[4],zero,ymm1[6,7,18,19,20,21],zero,ymm1[19],zero,ymm1[21,20,21,22],zero,ymm1[20],zero,ymm1[22,23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,ymm3[4],zero,ymm3[2],zero,zero,zero,zero,ymm3[5],zero,ymm3[3],zero,zero,zero,zero,ymm3[6],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4],zero,ymm10[2],zero,ymm10[4,5,4,5],zero,ymm10[3],zero,ymm10[3,4,5,6],zero,ymm10[20],zero,ymm10[18],zero,ymm10[20,21,20,21],zero,ymm10[19],zero,ymm10[19,20,21,22],zero +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm13, %ymm22, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm22 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm13, %zmm22, %zmm13 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm22 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm30 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm22, %zmm30, %zmm22 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm22 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded 
Reload +; AVX512F-FAST-NEXT: # zmm13 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm30 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm13, %zmm30, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm22, %zmm30, %zmm13 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm22 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm31 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm22, %zmm31, %zmm22 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm29 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm28 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm29, %zmm28, %zmm28 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm22, %zmm30, %zmm28 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm6[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm19[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm30, %zmm19 +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm20[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm21[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm17[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm11, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm16[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm23[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm25[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm14[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,3,2,3] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm2 +; 
AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512F-FAST-NEXT: vpermq $16, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm3 = mem[0,0,1,0,4,4,5,4] +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm27 +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm4 +; AVX512F-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) 
+; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-FAST-NEXT: addq $1784, %rsp # imm = 0x6F8 +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride7_vf64: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm15, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm10, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm11, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm16, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm17, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = 
[0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm18, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm19, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm21, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm2, %ymm21, %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm21, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm22, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm20, %ymm24, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[20],zero,ymm22[18],zero,ymm22[20,21,20,21],zero,ymm22[19],zero,ymm22[19,20,21,22],zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[20],zero,ymm21[18],zero,zero,zero,zero,ymm21[21],zero,ymm21[19],zero,zero,zero,zero,ymm21[22] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm24, %ymm27, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] -; AVX512BW-ONLY-SLOW-NEXT: movl $676341840, %r10d # imm = 0x28502850 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm27, %ymm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm28, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm27, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm25, %ymm26, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm25, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm26, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm29, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm14, %ymm23, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm23 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[18],zero,ymm26[18,19,20,21],zero,ymm26[19],zero,ymm26[25,26,27,22],zero,ymm26[20],zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm23, %ymm25, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm24 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] -; AVX512BW-ONLY-SLOW-NEXT: movl $338170920, %r10d # imm = 0x14281428 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm14 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm27, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm25, %ymm28, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[24,25],zero,ymm26[23],zero,ymm26[21,22,23,26],zero,ymm26[24],zero,ymm26[28,29,26,27] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm28 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm25, 
%ymm28, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm25 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm29, %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm26 = ymm26[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[0,2,3,3,4,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm26, %ymm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm14 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm28[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm22[23],zero,zmm22[23,24,25,26],zero,zmm22[24],zero,zmm22[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm22[59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm29[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm22, %zmm21, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm22 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm22[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm20, %ymm19, %ymm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm23, %ymm19, %ymm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm22, %ymm23, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm30[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm27, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: 
vpshufb {{.*#+}} zmm26 = zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm26[18,19,20,21],zero,zmm26[19],zero,zmm26[25,26,27,22],zero,zmm26[20],zero,zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm26[55],zero,zmm26[53,54,55,58],zero,zmm26[56],zero,zmm26[60,61,58,59] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm26 = zmm26[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[18],zero,zero,zero,zero,zmm20[21],zero,zmm20[19],zero,zero,zero,zero,zmm20[22],zero,zmm20[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm20[57],zero,zmm20[55],zero,zero,zero,zero,zmm20[58],zero,zmm20[56],zero,zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm26, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm27, %zmm20 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm29, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm29 = zmm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm29[18],zero,zmm29[20,21,20,21],zero,zmm29[19],zero,zmm29[19,20,21,22],zero,zmm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm29[55],zero,zmm29[55,56,57,58],zero,zmm29[56],zero,zmm29[62,63] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm29 = zmm29[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm28[20],zero,zmm28[18],zero,zero,zero,zero,zmm28[21],zero,zmm28[19],zero,zero,zero,zero,zmm28[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[57],zero,zmm28[55],zero,zero,zero,zero,zmm28[58],zero,zmm28[56],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vporq %zmm29, %zmm28, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm15, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm29 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm29, %zmm20 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm18[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm19, %ymm30 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm30[2,3,2,3],zmm19[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = 
xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm17, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm24, %xmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm16, %ymm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm25[2,3,2,3],zmm24[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm19, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm19, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm11, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm21, %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm19, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm11, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm27, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm24, %xmm26, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm11, %xmm25, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm25, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm29, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm28, %xmm27 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm25, %xmm27, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm11[0,1,0,1,4,5,4,5] -; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm25, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm28, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm25, %xmm21, %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm21, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm24, %xmm5, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm5, %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm7, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm6, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm15, %zmm1, %zmm1 
-; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq +; AVX512BW-SLOW-LABEL: store_i8_stride7_vf64: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: pushq %rax +; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm6 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm6, %ymm10, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm4 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm5 +; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm19 +; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm19 {%k1} +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm1 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = 
[128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm15, %ymm5, %ymm7 +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm27 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm21 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm22, %ymm8, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-SLOW-NEXT: vpshufb %ymm23, %ymm7, %ymm24 +; AVX512BW-SLOW-NEXT: vporq %ymm1, %ymm24, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm25 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm26, %xmm26 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm26, %zmm0 +; AVX512BW-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm21, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm6, %ymm19, %ymm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm19, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm9, %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm29 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-SLOW-NEXT: vpshufb %xmm29, %xmm14, %xmm30 +; AVX512BW-SLOW-NEXT: vporq %xmm19, %xmm30, %xmm19 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3],xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm14 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm14, %zmm14 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm19 = zmm14[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq 
$4647998506761461824, %r10 # imm = 0x4081020408102040 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm6, %zmm19 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm30 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm20, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm31 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm31, %xmm16, %xmm14 +; AVX512BW-SLOW-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm20[0],xmm16[1],xmm20[1],xmm16[2],xmm20[2],xmm16[3],xmm20[3],xmm16[4],xmm20[4],xmm16[5],xmm20[5],xmm16[6],xmm20[6],xmm16[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-SLOW-NEXT: vpshufb %xmm16, %xmm14, %xmm14 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm14, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm25, %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm20 +; AVX512BW-SLOW-NEXT: vporq %xmm14, %xmm20, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm24[0],xmm25[0],xmm24[1],xmm25[1],xmm24[2],xmm25[2],xmm24[3],xmm25[3],xmm24[4],xmm25[4],xmm24[5],xmm25[5],xmm24[6],xmm25[6],xmm24[7],xmm25[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm24, %xmm24 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm24, %zmm14 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm6[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm14[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %r10 # imm = 0xC183060C183060C +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm6 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rax), %xmm14 +; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %r10 # imm = 0x70E1C3870E1C3870 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm6 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512BW-SLOW-NEXT: vpermw %ymm14, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm21 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm21, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm19, %xmm24 +; AVX512BW-SLOW-NEXT: vpshufb %xmm29, %xmm21, %xmm25 +; AVX512BW-SLOW-NEXT: vporq %xmm24, %xmm25, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm24, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm24 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm25, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm25 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $290499906672591364, %r10 # imm = 0x408102040810204 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm11 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm25, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm31, %xmm24, %xmm29 +; AVX512BW-SLOW-NEXT: vporq %xmm10, %xmm29, %xmm10 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] 
+; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm29, %xmm27 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm27 = zmm10[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm29, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm0 +; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm29[8],xmm26[8],xmm29[9],xmm26[9],xmm29[10],xmm26[10],xmm29[11],xmm26[11],xmm29[12],xmm26[12],xmm29[13],xmm26[13],xmm29[14],xmm26[14],xmm29[15],xmm26[15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm27, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: movabsq $-8714997200177740921, %r10 # imm = 0x870E1C3870E1C387 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-SLOW-NEXT: vpermw %ymm11, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm12, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm17 +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm17, %ymm18 +; AVX512BW-SLOW-NEXT: vporq %ymm1, %ymm18, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm18 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm17[18],zero,ymm17[20,21,20,21],zero,ymm17[19],zero,ymm17[19,20,21,22],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] +; AVX512BW-SLOW-NEXT: vporq %ymm18, %ymm27, %ymm18 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm28 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm30 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX512BW-SLOW-NEXT: movl $676341840, %eax # imm = 0x28502850 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-SLOW-NEXT: vpshufb %ymm30, %ymm28, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm22, %ymm27, %ymm18 +; AVX512BW-SLOW-NEXT: vpshufb %ymm23, %ymm28, %ymm22 +; AVX512BW-SLOW-NEXT: vporq %ymm18, %ymm22, %ymm18 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm18, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm22 +; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm22, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512BW-SLOW-NEXT: vpshufb %ymm15, %ymm13, %ymm15 +; AVX512BW-SLOW-NEXT: vpor 
%ymm0, %ymm15, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm22[2],zero,ymm22[2,3,4,5],zero,ymm22[3],zero,ymm22[9,10,11,6],zero,ymm22[4],zero,ymm22[18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm23 = zero,ymm13[2],zero,zero,zero,zero,ymm13[5],zero,ymm13[3],zero,zero,zero,zero,ymm13[6],zero,ymm13[4],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] +; AVX512BW-SLOW-NEXT: vporq %ymm15, %ymm23, %ymm15 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k3} +; AVX512BW-SLOW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm18 {%k4} +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpshufb %ymm30, %ymm7, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [9,128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm23, %ymm7, %ymm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,9,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512BW-SLOW-NEXT: vpshufb %ymm30, %ymm8, %ymm15 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm15, %ymm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm4[18,19,20,21],zero,zmm4[19],zero,zmm4[25,26,27,22],zero,zmm4[20],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm4[55],zero,zmm4[53,54,55,58],zero,zmm4[56],zero,zmm4[60,61,58,59] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm15 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22],zero,zmm5[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm15 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm3[18],zero,zmm3[20,21,20,21],zero,zmm3[19],zero,zmm3[19,20,21,22],zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm3[55],zero,zmm3[55,56,57,58],zero,zmm3[56],zero,zmm3[62,63] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[20],zero,zmm2[18],zero,zero,zero,zero,zmm2[21],zero,zmm2[19],zero,zero,zero,zero,zmm2[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[57],zero,zmm2[55],zero,zero,zero,zero,zmm2[58],zero,zmm2[56],zero,zero +; 
AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpermw %zmm31, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k4} +; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm15 {%k4} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512BW-SLOW-NEXT: movl $338170920, %eax # imm = 0x14281428 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 {%k4} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm26[0],xmm29[0],xmm26[1],xmm29[1],xmm26[2],xmm29[2],xmm26[3],xmm29[3],xmm26[4],xmm29[4],xmm26[5],xmm29[5],xmm26[6],xmm29[6],xmm26[7],xmm29[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm7[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm24[0],xmm25[0],xmm24[1],xmm25[1],xmm24[2],xmm25[2],xmm24[3],xmm25[3],xmm24[4],xmm25[4],xmm24[5],xmm25[5],xmm24[6],xmm25[6],xmm24[7],xmm25[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm16, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,3,3,4,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm5 {%k2} +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[2,3,2,3],zmm7[0,1,0,1] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k3} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm31, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-SLOW-NEXT: vpermw %zmm3, %zmm5, %zmm3 +; AVX512BW-SLOW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 +; 
AVX512BW-SLOW-NEXT: kmovq %rax, %k3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k3} +; AVX512BW-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E +; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k3} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm28, %ymm0 {%k4} +; AVX512BW-SLOW-NEXT: vpshufb %ymm23, %ymm28, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb %ymm30, %ymm27, %ymm3 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm22[8,9],zero,ymm22[7],zero,ymm22[5,6,7,10],zero,ymm22[8],zero,ymm22[12,13,10,11,24,25],zero,ymm22[23],zero,ymm22[21,22,23,26],zero,ymm22[24],zero,ymm22[28,29,26,27] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm13[9],zero,ymm13[7],zero,zero,zero,zero,ymm13[10],zero,ymm13[8],zero,zero,zero,zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm13, %ymm3 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm22[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,3,3,4,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm5, %ymm3 {%k2} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm12, %zmm3 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[25],zero,zmm3[23],zero,zero,zero,zero,zmm3[26],zero,zmm3[24],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[59],zero,zero,zero,zero,zmm3[62],zero,zmm3[60],zero,zero,zero,zero,zmm3[63],zero,zmm3[61] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 
256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-SLOW-NEXT: popq %rax +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride7_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: subq $200, %rsp +; AVX512BW-FAST-NEXT: subq $136, %rsp ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa (%rax), %ymm4 -; AVX512BW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm13 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm15, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm10 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm25[8],xmm10[9],xmm25[9],xmm10[10],xmm25[10],xmm10[11],xmm25[11],xmm10[12],xmm25[12],xmm10[13],xmm25[13],xmm10[14],xmm25[14],xmm10[15],xmm25[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm22 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm3 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; 
AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm6 +; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm17 ; AVX512BW-FAST-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm1, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm11, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm14[8],xmm16[9],xmm14[9],xmm16[10],xmm14[10],xmm16[11],xmm14[11],xmm16[12],xmm14[12],xmm16[13],xmm14[13],xmm16[14],xmm14[14],xmm16[15],xmm14[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm11, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm12, %ymm18 -; AVX512BW-FAST-NEXT: vporq %ymm6, %ymm18, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm27 = xmm27[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] 
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm27, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm11, %ymm9 +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm13, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm15, %ymm9 +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm21 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm25 = xmm25[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm25, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %ymm27 -; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm27, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm27[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm27[20],zero,ymm27[18],zero,zero,zero,zero,ymm27[21],zero,ymm27[19],zero,zero,zero,zero,ymm27[22] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm17, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-FAST-NEXT: vpermw %ymm28, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm28, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %ymm29 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm29, %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm30 +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm30, %ymm2 +; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm30[18],zero,ymm30[20,21,20,21],zero,ymm30[19],zero,ymm30[19,20,21,22],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[20],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm2, %ymm17, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 ; AVX512BW-FAST-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm26 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm30 -; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm30, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22],zero,ymm30[20] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm21, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm0, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm17, %ymm20, %ymm17 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[18,19,20,21],zero,ymm31[19],zero,ymm31[21,20,21,22],zero,ymm31[20],zero,ymm31[22,23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm27, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm3, %ymm2 +; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm27[2],zero,ymm27[2,3,4,5],zero,ymm27[3],zero,ymm27[9,10,11,6],zero,ymm27[4],zero,ymm27[18],zero,ymm27[18,19,20,21],zero,ymm27[19],zero,ymm27[25,26,27,22],zero,ymm27[20],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm3[2],zero,zero,zero,zero,ymm3[5],zero,ymm3[3],zero,zero,zero,zero,ymm3[6],zero,ymm3[4],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm5, %ymm16 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm2, %ymm22 +; AVX512BW-FAST-NEXT: vporq %ymm16, %ymm22, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm5[2,3,4,5],zero,ymm5[3],zero,ymm5[5,4,5,6],zero,ymm5[4],zero,ymm5[6,7,18,19,20,21],zero,ymm5[19],zero,ymm5[21,20,21,22],zero,ymm5[20],zero,ymm5[22,23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = zero,zero,zero,zero,ymm2[5],zero,ymm2[3],zero,zero,zero,zero,ymm2[6],zero,ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm23, %ymm20 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm23 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 -; AVX512BW-FAST-NEXT: kmovq %r10, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm17 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm21 -; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 -; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm17 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm23[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm22[23],zero,zmm22[21,22,23,26],zero,zmm22[24],zero,zmm22[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm30[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm28[25],zero,zmm28[23],zero,zero,zero,zero,zmm28[26],zero,zmm28[24],zero,zero,zero,zero,zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero,zero -; 
AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm28, %zmm29 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[27],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm0[60],zero,zmm0[62,63,62,63],zero,zmm0[61],zero,zmm0[63,60,61] +; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm23, %ymm22 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm16 +; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %rsi # imm = 0x3060C183060C1830 +; AVX512BW-FAST-NEXT: kmovq %rsi, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %rsi # imm = 0xC3870E1C3870E1C3 +; AVX512BW-FAST-NEXT: kmovq %rsi, %k2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm17, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm13[28],zero,ymm13[30,31,30,31],zero,ymm13[29],zero,ymm13[31,28,29] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm4, %ymm17, %ymm4 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm22 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm4, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm25 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm24[0],xmm25[0],xmm24[1],xmm25[1],xmm24[2],xmm25[2],xmm24[3],xmm25[3],xmm24[4],xmm25[4],xmm24[5],xmm25[5],xmm24[6],xmm25[6],xmm24[7],xmm25[7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm23, %xmm23 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm26[27],zero,zero,zero,zero,ymm26[30],zero,ymm26[28],zero,zero,zero,zero,ymm26[31],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm23, %ymm26 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rax), %xmm26 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm0[23],zero,zmm0[21,22,23,26],zero,zmm0[24],zero,zmm0[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm22[23],zero,zero,zero,zero,zmm22[26],zero,zmm22[24],zero,zero,zero,zero,zmm22[27],zero,zmm22[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zero,zero,zmm1[27],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm1[60],zero,zmm1[62,63,62,63],zero,zmm1[61],zero,zmm1[63,60,61] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = 
zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[27],zero,zmm2[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm2, %zmm27 ; AVX512BW-FAST-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm29, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm30, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm1 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm27[0,1,2,3],zmm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-FAST-NEXT: vpermw %zmm28, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm11[28],zero,ymm11[30,31,30,31],zero,ymm11[29],zero,ymm11[31,28,29] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm28[0],xmm30[0],xmm28[1],xmm30[1],xmm28[2],xmm30[2],xmm28[3],xmm30[3],xmm28[4],xmm30[4],xmm28[5],xmm30[5],xmm28[6],xmm30[6],xmm28[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm0, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm31 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm1[0],xmm31[1],xmm1[1],xmm31[2],xmm1[2],xmm31[3],xmm1[3],xmm31[4],xmm1[4],xmm31[5],xmm1[5],xmm31[6],xmm1[6],xmm31[7],xmm1[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm15[27],zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm31, %xmm24 -; AVX512BW-FAST-NEXT: vporq %xmm2, %xmm24, %xmm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm31[8],xmm1[9],xmm31[9],xmm1[10],xmm31[10],xmm1[11],xmm31[11],xmm1[12],xmm31[12],xmm1[13],xmm31[13],xmm1[14],xmm31[14],xmm1[15],xmm31[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm30, %xmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = -; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm28, %xmm29 -; 
AVX512BW-FAST-NEXT: vporq %xmm24, %xmm29, %xmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm28 = xmm28[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm3, %xmm30 -; AVX512BW-FAST-NEXT: vporq %xmm28, %xmm30, %xmm28 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm28, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm13, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm3 {%k2} -; AVX512BW-FAST-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm24 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3],xmm14[4],xmm16[4],xmm14[5],xmm16[5],xmm14[6],xmm16[6],xmm14[7],xmm16[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm19, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm18, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm19, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm18, %xmm3 ; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm18[0],xmm19[0],xmm18[1],xmm19[1],xmm18[2],xmm19[2],xmm18[3],xmm19[3],xmm18[4],xmm19[4],xmm18[5],xmm19[5],xmm18[6],xmm19[6],xmm18[7],xmm19[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm2[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm21, %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512BW-FAST-NEXT: vpshufb %xmm28, %xmm20, %xmm18 +; AVX512BW-FAST-NEXT: vporq %xmm5, %xmm18, %xmm5 +; AVX512BW-FAST-NEXT: 
vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm18, %zmm5 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm18 = zmm5[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm2 -; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm12, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm20 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm14, %xmm5 +; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm12, %zmm5 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero -; AVX512BW-FAST-NEXT: vpermq 
{{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm4[18],zero,zmm4[20,21,20,21],zero,zmm4[19],zero,zmm4[19,20,21,22],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm4[55],zero,zmm4[55,56,57,58],zero,zmm4[56],zero,zmm4[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm8 = 
zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm5, %zmm8, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: kmovq %rax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm5 {%k2} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm3[19],zero,zmm3[21,20,21,22],zero,zmm3[20],zero,zmm3[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm8[21],zero,zmm8[19],zero,zero,zero,zero,zmm8[22],zero,zmm8[20],zero,zero,zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm8[55],zero,zero,zero,zero,zmm8[58],zero,zmm8[56],zero,zero,zero,zero,zmm8[59],zero,zmm8[57] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm8, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm8 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm4[18,19,20,21],zero,zmm4[19],zero,zmm4[25,26,27,22],zero,zmm4[20],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm4[55],zero,zmm4[53,54,55,58],zero,zmm4[56],zero,zmm4[60,61,58,59] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm7 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm11[18],zero,zero,zero,zero,zmm11[21],zero,zmm11[19],zero,zero,zero,zero,zmm11[22],zero,zmm11[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm11[57],zero,zmm11[55],zero,zero,zero,zero,zmm11[58],zero,zmm11[56],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm8, %zmm7, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1} ; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512BW-FAST-NEXT: kmovq %rax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm7 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm25, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm1 +; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm28, %xmm9, %xmm2 +; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm22, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm23, %xmm2 +; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm26, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 192(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512BW-FAST-NEXT: addq $200, %rsp +; AVX512BW-FAST-NEXT: addq $136, %rsp ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i8_stride7_vf64: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm15, %ymm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm15, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm10, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb 
%ymm24, %ymm11, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm21 -; AVX512DQBW-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm16, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm17, %ymm5 -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm18, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm19, %ymm7 -; AVX512DQBW-SLOW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm27 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm27, %zmm3 -; AVX512DQBW-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm21, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm2, %ymm21, %ymm21 -; 
AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm21, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm22 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm22, %ymm24 -; AVX512DQBW-SLOW-NEXT: vporq %ymm20, %ymm24, %ymm20 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[20],zero,ymm22[18],zero,ymm22[20,21,20,21],zero,ymm22[19],zero,ymm22[19,20,21,22],zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[20],zero,ymm21[18],zero,zero,zero,zero,ymm21[21],zero,ymm21[19],zero,zero,zero,zero,ymm21[22] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm24, %ymm27, %ymm24 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm24 -; AVX512DQBW-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm28 -; AVX512DQBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] -; AVX512DQBW-SLOW-NEXT: movl $676341840, %r10d # imm = 0x28502850 -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm27, %ymm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm28, %ymm25 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm27, %ymm26 -; AVX512DQBW-SLOW-NEXT: vporq %ymm25, %ymm26, %ymm25 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm25, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm14, %ymm26, %ymm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm29 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm29, %ymm23 -; AVX512DQBW-SLOW-NEXT: vporq %ymm14, %ymm23, %ymm14 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm23 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[18],zero,ymm26[18,19,20,21],zero,ymm26[19],zero,ymm26[25,26,27,22],zero,ymm26[20],zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm23, %ymm25, %ymm23 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,3,3,6,6,7,7] -; 
AVX512DQBW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm24 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] -; AVX512DQBW-SLOW-NEXT: movl $338170920, %r10d # imm = 0x14281428 -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k4 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm14 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512DQBW-SLOW-NEXT: # ymm23 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm27, %ymm25 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm25, %ymm28, %ymm25 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm25 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm26[24,25],zero,ymm26[23],zero,ymm26[21,22,23,26],zero,ymm26[24],zero,ymm26[28,29,26,27] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm28 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm25, %ymm28, %ymm28 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm25 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm29, %ymm29 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm26 = ymm26[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[0,2,3,3,4,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm26, %ymm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512DQBW-SLOW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 -; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm14 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm28 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm28[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm22[23],zero,zmm22[23,24,25,26],zero,zmm22[24],zero,zmm22[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm22[59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm29[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm21[25],zero,zmm21[23],zero,zero,zero,zero,zmm21[26],zero,zmm21[24],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[59],zero,zero,zero,zero,zmm21[62],zero,zmm21[60],zero,zero,zero,zero,zmm21[63],zero,zmm21[61] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm21 = 
zmm21[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vporq %zmm22, %zmm21, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-SLOW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm22 -; AVX512DQBW-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm14 {%k5} -; AVX512DQBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm22 = ymm18[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm22[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm20, %ymm19, %ymm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm23, %ymm19, %ymm22 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm22, %ymm23, %ymm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm30[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm27, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm23 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm26 = zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm26[18,19,20,21],zero,zmm26[19],zero,zmm26[25,26,27,22],zero,zmm26[20],zero,zmm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm26[55],zero,zmm26[53,54,55,58],zero,zmm26[56],zero,zmm26[60,61,58,59] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm26 = zmm26[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm20 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[18],zero,zero,zero,zero,zmm20[21],zero,zmm20[19],zero,zero,zero,zero,zmm20[22],zero,zmm20[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm20[57],zero,zmm20[55],zero,zero,zero,zero,zmm20[58],zero,zmm20[56],zero,zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vporq %zmm26, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm27, %zmm20 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm29, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm29 = zmm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm29[18],zero,zmm29[20,21,20,21],zero,zmm29[19],zero,zmm29[19,20,21,22],zero,zmm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm29[55],zero,zmm29[55,56,57,58],zero,zmm29[56],zero,zmm29[62,63] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm29 = zmm29[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm28[20],zero,zmm28[18],zero,zero,zero,zero,zmm28[21],zero,zmm28[19],zero,zero,zero,zero,zmm28[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[57],zero,zmm28[55],zero,zero,zero,zero,zmm28[58],zero,zmm28[56],zero,zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512DQBW-SLOW-NEXT: vporq %zmm29, 
%zmm28, %zmm29 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm15, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm29 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512DQBW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm29, %zmm20 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm18 = ymm18[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm18[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm19, %ymm30 {%k4} -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm30[2,3,2,3],zmm19[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm17, %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm24, %xmm24 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm16, %ymm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm25[2,3,2,3],zmm24[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm19, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm11, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm21, %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm11 {%k1} -; 
AVX512DQBW-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm27, %xmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm24, %xmm26, %xmm25 -; AVX512DQBW-SLOW-NEXT: vporq %xmm11, %xmm25, %xmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm25, %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm11, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm29, %xmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm28, %xmm27 -; AVX512DQBW-SLOW-NEXT: vporq %xmm25, %xmm27, %xmm25 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm28 = zmm11[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm27, %xmm11 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm25, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm28, %zmm11 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm22, %xmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm21, %zmm2, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm21 -; AVX512DQBW-SLOW-NEXT: vporq %xmm25, %xmm21, %xmm21 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm21, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm24, %xmm5, %xmm19 -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm5, %xmm5 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm5 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm7, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm6, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm15, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512DQBW-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 @@ -9690,7 +8657,13 @@ ; AVX512-FAST: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} +; AVX512BW-ONLY-SLOW: {{.*}} +; AVX512DQ-FAST: {{.*}} +; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} +; AVX512DQBW-SLOW: {{.*}} +; AVX512F-ONLY-FAST: {{.*}} +; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -25,56 +25,98 @@ ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: movdqa (%r11), %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm3 +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pslld $16, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rax) ; SSE-NEXT: retq ; -; AVX-LABEL: store_i8_stride8_vf2: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 -; AVX-NEXT: vmovdqa (%r11), %xmm3 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] -; AVX-NEXT: vmovdqa %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-ONLY-LABEL: store_i8_stride8_vf2: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,1,3,5,7,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,10,12,14,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: store_i8_stride8_vf2: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa (%r11), %xmm3 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,1,3,5,7,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,10,12,14,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i8_stride8_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vmovdqa (%r11), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; 
AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,10,12,14,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,1,3,5,7,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vmovdqa %xmm0, (%rax) +; AVX512-NEXT: retq %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64 @@ -103,73 +145,52 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r11), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,0,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: movdqa (%r8), %xmm3 +; SSE-NEXT: movdqa (%r11), %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw 
{{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf4: @@ -181,12 +202,12 @@ ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: 
vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 @@ -202,28 +223,53 @@ ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; -; AVX2-LABEL: store_i8_stride8_vf4: -; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-NEXT: vinserti128 $1, (%r11), %ymm3, %ymm3 -; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: store_i8_stride8_vf4: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i8_stride8_vf4: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512-NEXT: vpunpckldq (%rcx){1to4}, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX512-NEXT: vpunpckldq (%r10){1to4}, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 @@ -739,38 +785,38 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: por %xmm5, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: por %xmm12, %xmm6 @@ -821,35 +867,35 @@ ; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] ; 
SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: por %xmm13, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -858,12 +904,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm11, 112(%rax) +; SSE-NEXT: movdqa %xmm0, 112(%rax) +; SSE-NEXT: movdqa %xmm11, 96(%rax) ; SSE-NEXT: movdqa %xmm5, 80(%rax) ; SSE-NEXT: movdqa %xmm9, 64(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm8, 48(%rax) +; SSE-NEXT: movdqa %xmm12, 48(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) ; SSE-NEXT: movdqa %xmm7, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) @@ -1137,1225 +1183,1168 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp +; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa (%r9), %xmm12 +; SSE-NEXT: movdqa (%r9), %xmm1 ; SSE-NEXT: movdqa (%r10), %xmm14 -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: 
punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = 
xmm0[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = 
xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa 16(%r10), %xmm10 +; SSE-NEXT: movdqa 16(%r10), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm12 -; SSE-NEXT: movdqa 16(%r9), %xmm11 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm13 -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa 16(%r9), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 ; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[8],mem[8],xmm10[9],mem[9],xmm10[10],mem[10],xmm10[11],mem[11],xmm10[12],mem[12],xmm10[13],mem[13],xmm10[14],mem[14],xmm10[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = 
xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} 
xmm8 = xmm14[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: 
pandn %xmm5, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; SSE-NEXT: pand %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: por %xmm11, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, 
%xmm3 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movdqa %xmm3, 240(%rax) ; SSE-NEXT: movdqa %xmm6, 224(%rax) -; SSE-NEXT: movdqa %xmm10, 240(%rax) -; SSE-NEXT: movdqa %xmm4, 160(%rax) -; SSE-NEXT: movdqa %xmm2, 176(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm3, 112(%rax) -; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movdqa %xmm2, 208(%rax) +; SSE-NEXT: movdqa %xmm5, 192(%rax) +; SSE-NEXT: movdqa %xmm1, 176(%rax) +; SSE-NEXT: movdqa %xmm8, 160(%rax) +; SSE-NEXT: movdqa %xmm15, 144(%rax) +; SSE-NEXT: movdqa %xmm12, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $72, %rsp +; AVX1-ONLY-NEXT: subq $40, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = 
ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = 
xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] +; 
AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = 
ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm13 +; 
AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm9, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0],ymm11[1],ymm3[2],ymm11[3],ymm3[4],ymm11[5],ymm3[6],ymm11[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 -; 
AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; 
AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $72, %rsp +; AVX1-ONLY-NEXT: addq $40, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride8_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $88, %rsp +; AVX2-SLOW-NEXT: subq $24, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} 
xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15 -; AVX2-SLOW-NEXT: vmovaps 16(%r10), %xmm8 -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm7[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7,8],ymm15[9],ymm4[10,11,12],ymm15[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = 
xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm5[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7,8],ymm13[9],ymm1[10,11,12],ymm13[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3],ymm15[4,5,6],ymm1[7],ymm15[8,9,10],ymm1[11],ymm15[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: 
vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm15, %ymm6 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7,8],ymm15[9],ymm6[10,11,12],ymm15[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovaps 16(%rsi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2],ymm1[3],ymm15[4],ymm1[5],ymm15[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3],ymm15[4,5,6],ymm3[7],ymm15[8,9,10],ymm3[11],ymm15[12,13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = 
xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7],ymm10[8,9,10],ymm8[11],ymm10[12,13,14],ymm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm10[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm13[0,1,2],ymm7[3],ymm13[4,5,6],ymm7[7],ymm13[8,9,10],ymm7[11],ymm13[12,13,14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5,6],ymm10[7],ymm12[8,9,10],ymm10[11],ymm12[12,13,14],ymm10[15] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm10[1],ymm1[2],ymm10[3],ymm1[4],ymm10[5],ymm1[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: 
vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $88, %rsp +; AVX2-SLOW-NEXT: addq $24, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $72, %rsp +; AVX2-FAST-NEXT: pushq %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm6 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm11 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, 
%ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7,8],ymm8[9],ymm6[10,11,12],ymm8[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, 
%xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7],ymm10[8,9,10],ymm2[11],ymm10[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = 
ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 -; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm11, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $72, %rsp +; AVX2-FAST-NEXT: popq %rax ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: pushq %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7,8],ymm8[9],ymm6[10,11,12],ymm8[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7],ymm10[8,9,10],ymm2[11],ymm10[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: 
vpshufb %ymm7, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
ymm4 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; 
AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: popq %rax ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2363,194 +2352,181 @@ ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: 
vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm11, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq 
{{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5,6],ymm11[7],ymm9[8,9,10],ymm11[11],ymm9[12,13,14],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7],ymm8[8,9,10],ymm2[11],ymm8[12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 
16(%rdx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm12, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7,8],ymm12[9],ymm7[10,11,12],ymm12[13],ymm7[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6],ymm12[7],ymm14[8,9,10],ymm12[11],ymm14[12,13,14],ymm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = 
ymm6[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6],ymm0[7],ymm13[8,9,10],ymm0[11],ymm13[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: 
vinserti128 $1, %xmm14, %ymm13, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7,8],ymm2[9],ymm12[10,11,12],ymm2[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7,8],ymm12[9],ymm0[10,11,12],ymm12[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; 
AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
@@ -2558,24 +2534,23 @@
; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
+; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
-; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
+; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,1,3,4,5,5,7]
; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15]
; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 128(%rax)
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm7 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm13 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512F-ONLY-SLOW-NEXT: vzeroupper
; AVX512F-ONLY-SLOW-NEXT: retq
;
@@ -2583,162 +2558,166 @@
; AVX512F-ONLY-FAST: # %bb.0:
; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r10), %xmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm2
-; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm22
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r10), %xmm14
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm15
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm12
+; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm13
+; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 =
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7],ymm5[8,9,10],ymm0[11],ymm5[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7,8],ymm2[9],ymm8[10,11,12],ymm2[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm0, %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] ; AVX512F-ONLY-FAST-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm14 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 
16(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12 +; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0,1,2],ymm6[3],ymm12[4,5,6],ymm6[7],ymm12[8,9,10],ymm6[11],ymm12[12,13,14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm18, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -2747,212 +2726,200 @@ ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; 
AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6],ymm9[7],ymm1[8,9,10],ymm9[11],ymm1[12,13,14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7],ymm9[8,9,10],ymm0[11],ymm9[12,13,14],ymm0[15] -; AVX512DQ-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7],ymm2[8,9,10],ymm4[11],ymm2[12,13,14],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: 
vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7,8],ymm5[9],ymm2[10,11,12],ymm5[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = 
ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm14, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3],ymm13[4,5,6],ymm5[7],ymm13[8,9,10],ymm5[11],ymm13[12,13,14],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7],ymm6[8,9,10],ymm4[11],ymm6[12,13,14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: movw $-21846, %si # imm = 0xAAAA +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm7[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} 
ymm13 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7,8],ymm8[9],ymm3[10,11,12],ymm8[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm9, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6],ymm13[7],ymm12[8,9,10],ymm13[11],ymm12[12,13,14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7],ymm11[8,9,10],ymm10[11],ymm11[12,13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm10, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7,8],ymm12[9],ymm11[10,11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm15, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6],ymm12[7],ymm14[8,9,10],ymm12[11],ymm14[12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3],ymm13[4,5,6],ymm11[7],ymm13[8,9,10],ymm11[11],ymm13[12,13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm11, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7,8],ymm5[9],ymm2[10,11,12],ymm5[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7],ymm6[8,9,10],ymm4[11],ymm6[12,13,14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -2960,161 +2927,163 @@ ; AVX512DQ-FAST: # %bb.0: ; 
AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7,8],ymm2[9],ymm4[10,11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7,8],ymm0[9],ymm5[10,11,12],ymm0[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6],ymm7[7],ymm2[8,9,10],ymm7[11],ymm2[12,13,14],ymm7[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] +; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA +; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7,8],ymm1[9],ymm10[10,11,12],ymm1[13],ymm10[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm12 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm19 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA -; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5,6],ymm1[7],ymm14[8,9,10],ymm1[11],ymm14[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6],ymm0[7],ymm14[8,9,10],ymm0[11],ymm14[12,13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] ; 
AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7],ymm5[8,9,10],ymm0[11],ymm5[12,13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; 
AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm14[8],xmm9[9],xmm14[9],xmm9[10],xmm14[10],xmm9[11],xmm14[11],xmm9[12],xmm14[12],xmm9[13],xmm14[13],xmm9[14],xmm14[14],xmm9[15],xmm14[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -3123,110 +3092,90 @@ ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r11), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r11), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm25[8],xmm1[9],xmm25[9],xmm1[10],xmm25[10],xmm1[11],xmm25[11],xmm1[12],xmm25[12],xmm1[13],xmm25[13],xmm1[14],xmm25[14],xmm1[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%r11), %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r11), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-SLOW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rsi), %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm18 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm22 = xmm20[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> -; AVX512BW-SLOW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm5[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,2,3,4,33,6,7,8,34,10,11,12,35,14,15,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-SLOW-NEXT: vpermt2w %zmm18, %zmm10, %zmm5 ; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm4, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm4[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm19[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm24 = xmm19[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm19, %ymm22, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; 
AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm10, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm14[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm15, %ymm15 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm10, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm23, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = 
xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm11, %ymm22, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm25[0],xmm1[1],xmm25[1],xmm1[2],xmm25[2],xmm1[3],xmm25[3],xmm1[4],xmm25[4],xmm1[5],xmm25[5],xmm1[6],xmm25[6],xmm1[7],xmm25[7] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm1, %zmm23, %zmm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w 
%ymm4, %ymm22, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -3235,74 +3184,90 @@ ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r11), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> -; AVX512BW-FAST-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r11), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, 
%ymm2, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,10,9,10,42,12,13,11,43,16,17,20,52,20,21,21,53,24,25,30,62,30,29,31,63] +; AVX512BW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm10 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,1,33,u,u,2,42,u,u,3,43,u,u,20,52,u,u,21,53,u,u,30,62,u,u,31,63,u,u> +; AVX512BW-FAST-NEXT: vpermt2w %zmm18, %zmm10, %zmm5 ; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm4, %zmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm20, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm10, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: 
vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm10, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -4117,359 +4082,409 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw 
{{.*#+}} xmm15 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,1,4,4,6,5] -; 
AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, 
%ymm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, 
%ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm12 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm14 +; 
AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4],ymm13[5],ymm0[6],ymm13[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), 
%xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; 
AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: 
vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; 
AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4],ymm15[5],ymm3[6],ymm15[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 
+; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0],ymm14[1],ymm3[2],ymm14[3],ymm3[4],ymm14[5],ymm3[6],ymm14[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -4477,520 +4492,491 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 
-; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; 
AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm5, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride8_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; 
AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7,8],ymm12[9],ymm7[10,11,12],ymm12[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} 
ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 48(%r10), %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa 48(%rax), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6],ymm5[7],ymm3[8,9,10],ymm5[11],ymm3[12,13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%r8), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; 
AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7,8],ymm11[9],ymm4[10,11,12],ymm11[13],ymm4[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7,8],ymm14[9],ymm11[10,11,12],ymm14[13],ymm11[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2],ymm4[3],ymm11[4],ymm4[5],ymm11[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7,8],ymm1[9],ymm6[10,11,12],ymm1[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] 
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = 
ymm7[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 48(%r10), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rax), %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 48(%r9), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 48(%r8), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7,8],ymm14[9],ymm2[10,11,12],ymm14[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm11[1],ymm2[2],ymm11[3],ymm2[4],ymm11[5],ymm2[6],ymm11[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX2-SLOW-NEXT: 
vmovdqa 48(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4],ymm7[5],ymm1[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = 
xmm3[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2],ymm1[3],ymm6[4],ymm1[5],ymm6[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] @@ -5001,1213 +4987,1073 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; 
AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4],ymm9[5],ymm6[6,7,8],ymm9[9],ymm6[10,11,12],ymm9[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 192(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 448(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) 
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride8_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7,8],ymm14[9],ymm8[10,11,12],ymm14[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2],ymm5[3],ymm8[4],ymm5[5],ymm8[6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6],ymm5[7],ymm4[8,9,10],ymm5[11],ymm4[12,13,14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; 
AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm13 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7],ymm1[8,9,10],ymm3[11],ymm1[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm1 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm6 +; 
AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7,8],ymm14[9],ymm11[10,11,12],ymm14[13],ymm11[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = 
[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm2 ; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm11 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7,8],ymm12[9],ymm11[10,11,12],ymm12[13],ymm11[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> 
-; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] -; 
AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm15 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm10 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4],ymm7[5],ymm11[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7],ymm10[8,9,10],ymm6[11],ymm10[12,13,14],ymm6[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7,8],ymm10[9],ymm11[10,11,12],ymm10[13],ymm11[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7],ymm8[8,9,10],ymm4[11],ymm8[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; 
AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7],ymm1[8,9,10],ymm2[11],ymm1[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 416(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 384(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $392, %rsp # imm = 0x188 +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7,8],ymm14[9],ymm8[10,11,12],ymm14[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2],ymm5[3],ymm8[4],ymm5[5],ymm8[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6],ymm5[7],ymm4[8,9,10],ymm5[11],ymm4[12,13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; 
AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 16(%r10), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7],ymm1[8,9,10],ymm3[11],ymm1[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4],ymm11[5],ymm1[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7,8],ymm14[9],ymm11[10,11,12],ymm14[13],ymm11[14,15] +; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7,8],ymm15[9],ymm10[10,11,12],ymm15[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: 
vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm9, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7,8],ymm12[9],ymm11[10,11,12],ymm12[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm4 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7,8],ymm6[9],ymm2[10,11,12],ymm6[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm3 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm4 +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2],ymm7[3],ymm10[4],ymm7[5],ymm10[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7],ymm8[8,9,10],ymm6[11],ymm8[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2],ymm7[3],ymm11[4],ymm7[5],ymm11[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3],ymm10[4,5,6],ymm6[7],ymm10[8,9,10],ymm6[11],ymm10[12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} 
ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7,8],ymm10[9],ymm11[10,11,12],ymm10[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7],ymm8[8,9,10],ymm4[11],ymm8[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7],ymm1[8,9,10],ymm2[11],ymm1[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 384(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $392, %rsp # imm = 0x188 +; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: subq $184, %rsp ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2 -; 
AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-SLOW-NEXT: vmovaps 16(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm12 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; 
AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm3, %ymm19 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm5, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm0, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm10, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm1, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm10, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm6, %ymm31 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm6, %ymm24 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm6, %ymm25 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm6, %ymm26 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm8 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm6, %ymm18 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm6, %ymm30 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm6, %ymm21 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm6, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm7, %ymm23 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; 
AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm28 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm14, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm2, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm14, %ymm31 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm24 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = 
xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm18 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm30 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm13, %ymm19 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm13, %ymm17 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm16 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm12 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, 
%zmm28, %zmm12, %zmm16 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = 
xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; 
AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm5, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm9 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm19[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[0,2,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $96, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: movw $-21846, %r11w # imm = 0xAAAA +; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm5, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: 
vpshufd {{.*#+}} ymm10 = ymm17[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm5, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpord %zmm5, %zmm7, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm5, %ymm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm29[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm22 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm22 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm24[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm5, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpmovzxwq 
{{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm6, %ymm27 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm10, %ymm14 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm10, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm10, %ymm17 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, 
%xmm5, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm6, %ymm29 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm6, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm8 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm7[8],xmm11[9],xmm7[9],xmm11[10],xmm7[10],xmm11[11],xmm7[11],xmm11[12],xmm7[12],xmm11[13],xmm7[13],xmm11[14],xmm7[14],xmm11[15],xmm7[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm13, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm0 +; AVX512F-SLOW-NEXT: vpandnq %zmm11, %zmm4, %zmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm20[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpord %zmm11, %zmm13, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm10, %ymm20 +; AVX512F-SLOW-NEXT: vpshufhw 
{{.*#+}} xmm10 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm10, %ymm21 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm2, %ymm23 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm3, %zmm5 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpandnq %zmm13, %zmm8, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm23, %zmm23 -; AVX512F-SLOW-NEXT: vpord %zmm13, %zmm23, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm12[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm24, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm10[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm24 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm2, %ymm25 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm26 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm15 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm11 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm10 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm6, %ymm9 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm7 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm31 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm31[0],zero,zero,zero,xmm31[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm31, %ymm28 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm31, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm12, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm28 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm28, %zmm28 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm14 +; AVX512F-SLOW-NEXT: vpandnq %zmm28, %zmm4, %zmm28 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpord %zmm28, %zmm14, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm2, %ymm31 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm14, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: 
vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm14[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm14[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm30, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm27, %zmm27 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm12, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm14 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm30 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm30, %zmm29 +; AVX512F-SLOW-NEXT: vpandnq %zmm29, %zmm4, %zmm29 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm17, %zmm17 +; AVX512F-SLOW-NEXT: vpord %zmm29, %zmm17, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm1[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm14, %ymm14 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm29, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm2[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512F-SLOW-NEXT: vpandnq %zmm25, %zmm8, %zmm25 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpord %zmm25, %zmm16, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm29, %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm3, %zmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm10, %zmm8, %zmm10 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpord %zmm10, %zmm7, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm8, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm8, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm21 = ymm23[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm24[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm21 +; AVX512F-SLOW-NEXT: vpandnq %zmm20, %zmm4, %zmm20 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm21, %zmm21 +; AVX512F-SLOW-NEXT: vpord %zmm20, %zmm21, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm17, %ymm17 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm29, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm25[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm10, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm30, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm1 +; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm4, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm1 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-SLOW-NEXT: addq $184, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -6216,288 +6062,286 @@ ; AVX512F-FAST-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm14 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 -; AVX512F-FAST-NEXT: 
vmovdqa (%r9), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rcx), %xmm20 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 48(%rcx), %xmm26 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm23 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm5 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm26 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; 
AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm29 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm8 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm2 +; 
AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm17 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm18 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = 
[65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpandnq %zmm19, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm21 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm15 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm15 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm31 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm1 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm1, %zmm31 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm27 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpandnq %zmm28, %zmm27, %zmm15 +; AVX512F-FAST-NEXT: vpandq %zmm27, %zmm16, %zmm16 ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpord %zmm19, %zmm21, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm15 +; AVX512F-FAST-NEXT: vpord %zmm15, %zmm16, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm13 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, 
%ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm7 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm7, %ymm16, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm16 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm10 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm7 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 -; 
AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3],xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm14 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm14, %ymm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 -; 
AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm20, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm3, %zmm14 -; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpandnq %zmm28, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm29, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm8 -; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm3, %zmm5 -; AVX512F-FAST-NEXT: vpandnq %zmm13, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm21, %zmm0 -; AVX512F-FAST-NEXT: vpord %zmm3, %zmm0, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm12 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm7, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq (%rsp), %zmm27, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm7, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm16 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq %zmm27, %zmm30, %zmm7 +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm7, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm23, %zmm1, %zmm9 +; AVX512F-FAST-NEXT: vpandnq %zmm21, %zmm27, %zmm4 +; AVX512F-FAST-NEXT: vpandq %zmm27, %zmm22, %zmm7 +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm7, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpandnq %zmm24, %zmm27, %zmm4 +; AVX512F-FAST-NEXT: vpandq %zmm27, %zmm25, %zmm7 +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm7, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm26, %zmm1, %zmm10 +; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm27, %zmm4 +; AVX512F-FAST-NEXT: vpandq %zmm27, %zmm11, %zmm7 +; AVX512F-FAST-NEXT: vpord %zmm4, %zmm7, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm28, %zmm1, %zmm2 +; AVX512F-FAST-NEXT: vpandnq %zmm6, %zmm27, %zmm1 +; AVX512F-FAST-NEXT: vpandq %zmm27, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpord %zmm1, %zmm4, %zmm2 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 256(%rax) +; 
AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%rax) ; AVX512F-FAST-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -6506,230 +6350,224 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rax), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm23 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rax), %xmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rax), %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r9), %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm21 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm24 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] -; AVX512BW-SLOW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm30 -; 
AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm3, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512BW-SLOW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm1[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm18, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm23 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] +; AVX512BW-SLOW-NEXT: movl $572662306, %r11d # imm = 0x22222222 +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k2 +; AVX512BW-SLOW-NEXT: vpermw %zmm18, %zmm4, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm19 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm21 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm18 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm7 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm3, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm2, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = 
xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm4, %zmm6 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm3, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm2, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm8[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm8[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm17, %ymm17 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] -; AVX512BW-SLOW-NEXT: movl $572662306, %eax # imm = 0x22222222 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm12, %zmm3 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw 
{{.*#+}} xmm14 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 -; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm10, %zmm14 -; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm9, %zmm14 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm15[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm15[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm27, %ymm27 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm20, %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm23 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 +; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm4, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm20 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm22 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm17 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm15 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm13 +; AVX512BW-SLOW-NEXT: vpermw %zmm13, %zmm3, %zmm13 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm2, %zmm13 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm15, %zmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm27, %zmm27 -; AVX512BW-SLOW-NEXT: vpermw %zmm27, %zmm12, %zmm15 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} 
xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm31 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm21 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm20, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm10, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm9, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm20[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm20[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm14[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm15, %ymm15 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm16, %ymm14 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm4, %zmm14 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm3, %zmm15 +; AVX512BW-SLOW-NEXT: vpermw %zmm16, %zmm2, %zmm15 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm22[0],xmm20[0],xmm22[1],xmm20[1],xmm22[2],xmm20[2],xmm22[3],xmm20[3],xmm22[4],xmm20[4],xmm22[5],xmm20[5],xmm22[6],xmm20[6],xmm22[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm16[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm16[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; 
AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm18, %ymm18 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm19, %ymm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm19 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm18, %zmm18 +; AVX512BW-SLOW-NEXT: vpermw %zmm18, %zmm4, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm18 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rax), %xmm12 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm11 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm9 +; AVX512BW-SLOW-NEXT: vpermw %zmm9, %zmm3, %zmm9 +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm2, %zmm9 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm10[3,3,3,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm21 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm29[0],xmm31[1],xmm29[1],xmm31[2],xmm29[2],xmm31[3],xmm29[3],xmm31[4],xmm29[4],xmm31[5],xmm29[5],xmm31[6],xmm29[6],xmm31[7],xmm29[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm24 = xmm0[3,3,3,3] -; 
AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm27[0],xmm2[1],xmm27[1],xmm2[2],xmm27[2],xmm2[3],xmm27[3],xmm2[4],xmm27[4],xmm2[5],xmm27[5],xmm2[6],xmm27[6],xmm2[7],xmm27[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm24 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm30 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm25 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm22 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, %zmm22 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm29[8],xmm31[9],xmm29[9],xmm31[10],xmm29[10],xmm31[11],xmm29[11],xmm31[12],xmm29[12],xmm31[13],xmm29[13],xmm31[14],xmm29[14],xmm31[15],xmm29[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm23, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm27[8],xmm2[9],xmm27[9],xmm2[10],xmm27[10],xmm2[11],xmm27[11],xmm2[12],xmm27[12],xmm2[13],xmm27[13],xmm2[14],xmm27[14],xmm2[15],xmm27[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm23 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm16[0],xmm13[0],xmm16[1],xmm13[1],xmm16[2],xmm13[2],xmm16[3],xmm13[3],xmm16[4],xmm13[4],xmm16[5],xmm13[5],xmm16[6],xmm13[6],xmm16[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm10, %zmm26 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm9, 
%zmm26 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm25[0],xmm2[1],xmm25[1],xmm2[2],xmm25[2],xmm2[3],xmm25[3],xmm2[4],xmm25[4],xmm2[5],xmm25[5],xmm2[6],xmm25[6],xmm2[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm12, %zmm27 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm13[8],xmm16[9],xmm13[9],xmm16[10],xmm13[10],xmm16[11],xmm13[11],xmm16[12],xmm13[12],xmm16[13],xmm13[13],xmm16[14],xmm13[14],xmm16[15],xmm13[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm10, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm13, %zmm9, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX512BW-SLOW-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3],xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 -; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm10, %zmm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm7 -; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm9, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm7 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 
$1, %xmm1, %ymm9, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm25[8],xmm2[9],xmm25[9],xmm2[10],xmm25[10],xmm2[11],xmm25[11],xmm2[12],xmm25[12],xmm2[13],xmm25[13],xmm2[14],xmm25[14],xmm2[15],xmm25[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm12, %zmm1 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX512BW-SLOW-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm11, %ymm11 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm20 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm21 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm4, %zmm10 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm18[0],xmm12[1],xmm18[1],xmm12[2],xmm18[2],xmm12[3],xmm18[3],xmm12[4],xmm18[4],xmm12[5],xmm18[5],xmm12[6],xmm18[6],xmm12[7],xmm18[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm17 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm3, %zmm11 +; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm2, %zmm11 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm17 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm18[8],xmm12[9],xmm18[9],xmm12[10],xmm18[10],xmm12[11],xmm18[11],xmm12[12],xmm18[12],xmm12[13],xmm18[13],xmm12[14],xmm18[14],xmm12[15],xmm18[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm18 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rcx), %xmm20 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm19 +; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm3, %zmm3 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; 
AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpermw %zmm12, %zmm2, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm19[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm12, %zmm2 {%k2} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm2 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm12, %ymm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdx), %xmm19 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpermw %zmm12, %zmm4, %zmm2 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm12[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm12[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm17, %ymm17 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm18, %ymm12 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm19[8],xmm20[8],xmm19[9],xmm20[9],xmm19[10],xmm20[10],xmm19[11],xmm20[11],xmm19[12],xmm20[12],xmm19[13],xmm20[13],xmm19[14],xmm20[14],xmm19[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 +; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm4, %zmm12 {%k2} ; AVX512BW-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm14, %zmm15 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm24 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm22, %zmm23 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm26, %zmm27 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; 
AVX512BW-SLOW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm13, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm15, %zmm16 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm11, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm3, %zmm12 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -6737,234 +6575,234 @@ ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r10), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%r10), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm1 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rax), %xmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rax), %xmm20 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa 48(%r10), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rax), %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa 48(%rax), %xmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7,0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%r9), %xmm21 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%r8), %xmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7,0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm7, %zmm16 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm3, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7,0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm5, %zmm14 ; AVX512BW-FAST-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm6, %zmm16 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rcx), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15,8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm8, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rsi), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm26 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, 
%xmm4, %ymm4, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15,8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm17, %zmm17 ; AVX512BW-FAST-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm4 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm17 ; AVX512BW-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %eax, %k3 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm8 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm16, %zmm16 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm16, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm25[0],xmm28[1],xmm25[1],xmm28[2],xmm25[2],xmm28[3],xmm25[3],xmm28[4],xmm25[4],xmm28[5],xmm25[5],xmm28[6],xmm25[6],xmm28[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm16, %xmm26 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm27, %ymm26 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm30 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm26, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm26, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm26 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm16 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm27 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm20 = 
xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm21 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm29 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm28[8],xmm25[8],xmm28[9],xmm25[9],xmm28[10],xmm25[10],xmm28[11],xmm25[11],xmm28[12],xmm25[12],xmm28[13],xmm25[13],xmm28[14],xmm25[14],xmm28[15],xmm25[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm23, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm17, %xmm25 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm17 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm22, %zmm17 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm21[0],xmm29[1],xmm21[1],xmm29[2],xmm21[2],xmm29[3],xmm21[3],xmm29[4],xmm21[4],xmm29[5],xmm21[5],xmm29[6],xmm21[6],xmm29[7],xmm21[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm23 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm28 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm23, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm23, %zmm23 -; AVX512BW-FAST-NEXT: 
vpshufb %zmm9, %zmm23, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm23, %zmm20 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm20 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm22 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm26, %ymm26 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm26, %zmm26 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm26, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm26 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm14, %zmm4 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm14 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm21 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm8, %zmm15 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm8, %ymm8, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm20 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm15, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm11, %zmm8 {%k3} +; 
AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm11, %zmm15 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm21[0],xmm16[0],xmm21[1],xmm16[1],xmm21[2],xmm16[2],xmm21[3],xmm16[3],xmm21[4],xmm16[4],xmm21[5],xmm16[5],xmm21[6],xmm16[6],xmm21[7],xmm16[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm18 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm20, %ymm18 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm22 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm18, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm22[0],xmm14[0],xmm22[1],xmm14[1],xmm22[2],xmm14[2],xmm22[3],xmm14[3],xmm22[4],xmm14[4],xmm22[5],xmm14[5],xmm22[6],xmm14[6],xmm22[7],xmm14[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm18, %zmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm29[8],xmm21[8],xmm29[9],xmm21[9],xmm29[10],xmm21[10],xmm29[11],xmm21[11],xmm29[12],xmm21[12],xmm29[13],xmm21[13],xmm29[14],xmm21[14],xmm29[15],xmm21[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm28 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm18, %ymm18 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm25[0],xmm23[0],xmm25[1],xmm23[1],xmm25[2],xmm23[2],xmm25[3],xmm23[3],xmm25[4],xmm23[4],xmm25[5],xmm23[5],xmm25[6],xmm23[6],xmm25[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, 
%xmm21, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm28 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm19[0],xmm28[1],xmm19[1],xmm28[2],xmm19[2],xmm28[3],xmm19[3],xmm28[4],xmm19[4],xmm28[5],xmm19[5],xmm28[6],xmm19[6],xmm28[7],xmm19[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm21 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm11 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm18 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm11 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm20 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm22[8],xmm14[8],xmm22[9],xmm14[9],xmm22[10],xmm14[10],xmm22[11],xmm14[11],xmm22[12],xmm14[12],xmm22[13],xmm14[13],xmm22[14],xmm14[14],xmm22[15],xmm14[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm12, %zmm14 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm21[8],xmm16[8],xmm21[9],xmm16[9],xmm21[10],xmm16[10],xmm21[11],xmm16[11],xmm21[12],xmm16[12],xmm21[13],xmm16[13],xmm21[14],xmm16[14],xmm21[15],xmm16[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm12, %ymm12, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm21 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqu16 
%zmm14, %zmm12 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm14, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm14 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm21, %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm21 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm21[0],xmm15[0],xmm21[1],xmm15[1],xmm21[2],xmm15[2],xmm21[3],xmm15[3],xmm21[4],xmm15[4],xmm21[5],xmm15[5],xmm21[6],xmm15[6],xmm21[7],xmm15[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm13 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 48(%r9), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm13 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 48(%r8), %xmm16 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FAST-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512BW-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm20 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm20, %zmm20 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm18, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm22 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm18 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm21[8],xmm15[8],xmm21[9],xmm15[9],xmm21[10],xmm15[10],xmm21[11],xmm15[11],xmm21[12],xmm15[12],xmm21[13],xmm15[13],xmm21[14],xmm15[14],xmm21[15],xmm15[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: 
vpunpckhbw {{.*#+}} xmm14 = xmm28[8],xmm19[8],xmm28[9],xmm19[9],xmm28[10],xmm19[10],xmm28[11],xmm19[11],xmm28[12],xmm19[12],xmm28[13],xmm19[13],xmm28[14],xmm19[14],xmm28[15],xmm19[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm22 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm23, %ymm22 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm15, %zmm20 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm19 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm15, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm15 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm14, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm19 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm19 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm19, %zmm19 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm19, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm10[0],xmm18[1],xmm10[1],xmm18[2],xmm10[2],xmm18[3],xmm10[3],xmm18[4],xmm10[4],xmm18[5],xmm10[5],xmm18[6],xmm10[6],xmm18[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm17, %xmm20 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm21, %ymm20 +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm21 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm20, %zmm17 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = 
xmm21[0],xmm9[0],xmm21[1],xmm9[1],xmm21[2],xmm9[2],xmm21[3],xmm9[3],xmm21[4],xmm9[4],xmm21[5],xmm9[5],xmm21[6],xmm9[6],xmm21[7],xmm9[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm20, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm19, %zmm17 {%k3} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm14[8],xmm16[9],xmm14[9],xmm16[10],xmm14[10],xmm16[11],xmm14[11],xmm16[12],xmm14[12],xmm16[13],xmm14[13],xmm16[14],xmm14[14],xmm16[15],xmm14[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm21[8],xmm9[8],xmm21[9],xmm9[9],xmm21[10],xmm9[10],xmm21[11],xmm9[11],xmm21[12],xmm9[12],xmm21[13],xmm9[13],xmm21[14],xmm9[14],xmm21[15],xmm9[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm18[8],xmm10[8],xmm18[9],xmm10[9],xmm18[10],xmm10[10],xmm18[11],xmm10[11],xmm18[12],xmm10[12],xmm18[13],xmm10[13],xmm18[14],xmm10[14],xmm18[15],xmm10[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 
%zmm17, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 @@ -6987,7 +6825,9 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} ; AVX1: {{.*}} +; AVX2: {{.*}} ; AVX512-SLOW: {{.*}} ; AVX512BW: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll b/llvm/test/CodeGen/X86/vector-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -161,14 +161,15 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rdi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 32(%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -209,13 +210,13 @@ ; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, (%rbx) -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps %xmm0, 48(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 32(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 48(%rbx) +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: addq $96, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq @@ -232,24 +233,24 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rdi), %xmm2 +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rdi), %xmm2 -; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte 
Spill -; CHECK-NEXT: movaps (%rsi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rsi), %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rsi), %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -304,13 +305,13 @@ ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, (%rbx) -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps %xmm0, 48(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 32(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 48(%rbx) +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: addq $160, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq @@ -330,14 +331,15 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rdi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 32(%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: movl %esi, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -387,13 +389,13 @@ ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, (%rbx) -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps %xmm0, 48(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 32(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 48(%rbx) +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: addq $104, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -107,9 +107,11 @@ ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: 
psrld $16, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -117,9 +119,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %sub = sub <2 x i32> , %x %bc = bitcast <2 x i32> %sub to <8 x i8> @@ -180,8 +181,8 @@ ; ; AVX512-LABEL: fmul_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512-NEXT: vmulpd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm2 * xmm2) + xmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -534,11 +534,20 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retq ; -; AVX512-LABEL: cmpne_knownzeros_zext_v8i16_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq +; AVX512F-LABEL: cmpne_knownzeros_zext_v8i16_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: cmpne_knownzeros_zext_v8i16_v8i32: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQBW-NEXT: retq %a = lshr <8 x i16> %x, %b = icmp ne <8 x i16> %a, zeroinitializer %c = zext <8 x i1> %b to <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -818,24 +818,23 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddq %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; 
BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestnmq %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddq %xmm3, %xmm0, %xmm3 -; BITALG-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestnmq %xmm2, %xmm0, %k1 +; BITALG-NEXT: vptestmq %xmm0, %xmm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp eq <2 x i64> %2, @@ -955,26 +954,25 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddq %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestmq %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmq %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddq %xmm3, %xmm0, %xmm4 -; BITALG-NEXT: vpand %xmm4, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $222, %xmm3, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestmq %xmm2, %xmm0, %k0 +; BITALG-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ne <2 x i64> %2, @@ -1034,24 +1032,23 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddd %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestnmd %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 
{%k1} +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddd %xmm3, %xmm0, %xmm3 -; BITALG-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestnmd %xmm2, %xmm0, %k1 +; BITALG-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp eq <4 x i32> %2, @@ -1118,26 +1115,25 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddd %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddd %xmm3, %xmm0, %xmm4 -; BITALG-NEXT: vpand %xmm4, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogd $222, %xmm3, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestmd %xmm2, %xmm0, %k0 +; BITALG-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ne <4 x i32> %2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -10,18 +10,18 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_1_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v32i8: @@ -76,13 +76,13 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_2_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1003,18 +1003,18 @@ define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_1_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v16i16: @@ -1069,13 +1069,13 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_2_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3302,18 +3302,18 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_1_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, 
%xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v8i32: @@ -3370,13 +3370,13 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_2_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -9406,18 +9406,18 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_1_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v4i64: @@ -9474,13 +9474,13 @@ define <4 x i64> @ult_2_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_2_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -509,24 +509,22 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestnmq %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; BITALG-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestnmq %ymm2, %ymm0, %k1 +; BITALG-NEXT: vptestmq %ymm0, %ymm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp eq <4 x i64> %2, @@ -605,25 +603,24 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestmq %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmq %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddq %ymm3, %ymm0, %ymm4 -; BITALG-NEXT: vpand %ymm4, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $222, %ymm3, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestmq %ymm2, %ymm0, %k0 +; BITALG-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ne <4 x i64> %2, @@ -696,24 +693,22 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddd %ymm3, %ymm0, %ymm3 -; 
BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestnmd %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddd %ymm3, %ymm0, %ymm3 -; BITALG-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestnmd %ymm2, %ymm0, %k1 +; BITALG-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp eq <8 x i32> %2, @@ -792,25 +787,24 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddd %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddd %ymm3, %ymm0, %ymm4 -; BITALG-NEXT: vpand %ymm4, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogd $222, %ymm3, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestmd %ymm2, %ymm0, %k0 +; BITALG-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ne <8 x i32> %2, diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -100,34 +100,23 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; 
AVX512BW-LABEL: test_v4i64_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v4i64_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: test_v4i64_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <4 x i64> %a0, %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1) ret i64 %2 @@ -185,7 +174,7 @@ ; AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -195,9 +184,12 @@ ; AVX512-LABEL: test_v8i64_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlq $60, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -288,7 +280,7 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -417,7 +409,8 @@ ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpsrld $31, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -496,8 +489,9 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -507,34 +501,27 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v8i32_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v8i32_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: test_v8i32_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <8 x i32> %a0, %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) ret i32 %2 @@ -617,9 +604,9 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -628,11 +615,15 @@ ; ; AVX512-LABEL: test_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -754,9 +745,9 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -961,15 +952,47 @@ ; SSE41-NEXT: # kill: def $ax killed $ax killed 
$eax ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v8i16_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i16_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = and <8 x i16> %a0, %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) ret i16 %2 @@ -1006,62 +1029,67 @@ ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-SLOW-LABEL: test_v16i16_v16i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i16_v16i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, 
%ymm0, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v16i16_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: test_v16i16_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <16 x i16> %a0, %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) ret i16 %2 @@ -1145,13 +1173,16 @@ ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw 
%zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1322,3 +1353,5 @@ declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll @@ -84,7 +84,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -95,7 +95,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -174,7 +174,7 @@ ; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -187,7 +187,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -320,7 +320,7 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -337,7 +337,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -441,7 +441,8 @@ ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -517,8 +518,9 @@ ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-FAST-NEXT: 
vpmovsxbd %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -527,9 +529,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -540,9 +542,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -639,9 +641,9 @@ ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -654,9 +656,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -810,9 +812,9 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -828,9 +830,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1026,8 +1028,10 @@ ; AVX1-FAST-LABEL: test_v8i16_v8i8: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; 
AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1120,9 +1124,11 @@ ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1132,11 +1138,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1148,11 +1154,11 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1263,11 +1269,11 @@ ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1281,11 +1287,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1445,11 +1451,11 @@ ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1466,11 +1472,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1560,7 +1566,8 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -1641,8 +1648,10 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1773,11 +1782,15 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1788,11 +1801,15 @@ ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # 
kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1888,13 +1905,17 @@ ; AVX512-NEXT: vpmovb2m %zmm0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll @@ -81,7 +81,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -92,7 +92,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -111,12 +111,32 @@ ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i64_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i64> %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1) ret 
i64 %2 @@ -220,13 +240,22 @@ ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq +; AVX1-SLOW-LABEL: test_v4i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: @@ -257,12 +286,56 @@ ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i32_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v8i32_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i32_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i32> %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) ret i32 %2 @@ -278,14 +351,38 @@ ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i32_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <16 x i8> %a0 to <16 x i32> %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) ret i32 %2 @@ -492,13 +589,41 @@ ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v8i16_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i16_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i16> %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) ret i16 %2 @@ -515,15 +640,68 @@ ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i16_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v16i16_v16i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i16_v16i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-FAST-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; 
AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <16 x i8> %a0 to <16 x i16> %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) ret i16 %2 @@ -571,12 +749,17 @@ ; ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -58,7 +58,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -68,7 +68,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -106,7 +106,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -118,7 +118,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -168,7 +168,7 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -181,7 +181,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -254,7 +254,8 @@ ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -306,8 +307,9 @@ ; AVX1-FAST-LABEL: test_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -316,9 +318,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -328,9 +330,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -386,9 +388,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -400,9 +402,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd 
%zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -476,9 +478,9 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -491,9 +493,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -633,8 +635,10 @@ ; ; AVX1-FAST-LABEL: test_v8i16: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -700,9 +704,11 @@ ; AVX1-FAST-LABEL: test_v16i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -712,11 +718,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -727,11 +733,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw 
%xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -798,11 +804,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -815,11 +821,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -904,11 +910,11 @@ ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -922,11 +928,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1201,11 +1207,15 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1214,11 +1224,15 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1261,11 +1275,15 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1274,13 +1292,17 @@ ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1292,34 +1314,34 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: -; SSE-NEXT: paddb %xmm7, %xmm3 -; SSE-NEXT: paddb %xmm5, %xmm1 -; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm6, %xmm2 ; SSE-NEXT: paddb %xmm4, %xmm0 ; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: paddb %xmm7, %xmm3 +; SSE-NEXT: paddb %xmm5, %xmm1 +; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: psadbw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: psadbw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; 
AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1335,11 +1357,15 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1349,13 +1375,17 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -13,43 +13,49 @@ ; define i1 @trunc_v2i64_v2i1(<2 x i64>) nounwind { -; SSE2-LABEL: trunc_v2i64_v2i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: cmpl $3, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v2i64_v2i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_v2i1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1OR2-LABEL: trunc_v2i64_v2i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0 ; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_v2i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v2i64_v2i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_v2i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $3, %al +; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) @@ -57,43 +63,49 @@ } define i1 @trunc_v4i32_v4i1(<4 x i32>) nounwind { -; SSE2-LABEL: trunc_v4i32_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v4i32_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1OR2-LABEL: trunc_v4i32_v4i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vtestps %xmm1, %xmm0 ; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i32_v4i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i32_v4i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i32_v4i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) @@ -101,25 +113,22 @@ } define i1 @trunc_v8i16_v8i1(<8 x i16>) nounwind { -; SSE2-LABEL: trunc_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: 
cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1OR2-LABEL: trunc_v8i16_v8i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i16_v8i1: @@ -146,88 +155,85 @@ } define i1 @trunc_v16i8_v16i1(<16 x i8>) nounwind { -; SSE2-LABEL: trunc_v16i8_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v16i8_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq -; -; AVX1OR2-LABEL: trunc_v16i8_v16i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i8_v16i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i8_v16i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v16i8_v16i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al -; AVX512VL-NEXT: retq +; SSE-LABEL: trunc_v16i8_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: trunc_v16i8_v16i1: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: sete %al +; AVX-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } define i1 @trunc_v4i64_v4i1(<4 x i64>) nounwind { -; SSE2-LABEL: trunc_v4i64_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: cmpl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v4i64_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: trunc_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_v4i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestpd 
%ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b @@ -236,31 +242,47 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind { ; SSE2-LABEL: trunc_v8i32_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: trunc_v8i32_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -278,35 +300,39 @@ } define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { -; SSE2-LABEL: trunc_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: 
ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: trunc_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -323,42 +349,41 @@ } define i1 @trunc_v32i8_v32i1(<32 x i8>) nounwind { -; SSE2-LABEL: trunc_v32i8_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v32i8_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: trunc_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_v32i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; 
AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> @@ -407,26 +432,43 @@ ; ; SSE41-LABEL: trunc_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -452,12 +494,17 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pslld $31, %xmm1 -; X86-SSE2-NEXT: movmskps %xmm1, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE2-NEXT: psllw $7, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -465,38 +512,66 @@ ; ; X64-SSE2-LABEL: trunc_v16i32_v16i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pslld $31, %xmm0 -; X64-SSE2-NEXT: movmskps %xmm0, %eax -; X64-SSE2-NEXT: xorl $15, %eax +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: pand %xmm4, 
%xmm0 +; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X64-SSE2-NEXT: psllw $7, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_v16i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -519,53 +594,66 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: psllw $7, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: notl %eax -; X86-SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: psllw $7, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: trunc_v32i16_v32i1: -; X64-SSE2: 
# %bb.0: -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: psllw $7, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: notl %eax -; X64-SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: trunc_v32i16_v32i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; X64-SSE-NEXT: pand %xmm4, %xmm1 +; X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE-NEXT: pand %xmm4, %xmm3 +; X64-SSE-NEXT: pand %xmm4, %xmm2 +; X64-SSE-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE-NEXT: pand %xmm0, %xmm2 +; X64-SSE-NEXT: psllw $7, %xmm2 +; X64-SSE-NEXT: pmovmskb %xmm2, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -660,10 +748,11 @@ ; ; AVX512F-LABEL: trunc_v64i8_v64i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm0, %eax +; AVX512F-NEXT: cmpl $-1, %eax ; AVX512F-NEXT: sete %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -701,8 +790,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: 
pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -712,11 +803,39 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp0_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp0_v2i64_v2i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp0_v2i64_v2i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v2i64_v2i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v2i64_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $3, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -728,7 +847,7 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -738,37 +857,84 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp0_v4i32_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp0_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp0_v4i32_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v4i32_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v4i32_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp0_v8i16_v8i1(<8 x i16>) nounwind { -; SSE2-LABEL: icmp0_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} +; SSE-LABEL: icmp0_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE-NEXT: packsswb %xmm1, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX1OR2-LABEL: icmp0_v8i16_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; -; SSE41-LABEL: icmp0_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; AVX512F-LABEL: icmp0_v8i16_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: retq ; -; AVX-LABEL: icmp0_v8i16_v8i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX512BW-LABEL: icmp0_v8i16_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptest %xmm0, %xmm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v8i16_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest %xmm0, %xmm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <8 x i16> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -780,7 +946,7 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -803,49 +969,81 @@ define i1 @icmp0_v4i64_v4i1(<4 x i64>) nounwind { ; SSE2-LABEL: icmp0_v4i64_v4i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: icmp0_v4i64_v4i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: cmpl $15, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp0_v4i64_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp0_v4i64_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %ymm0, %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp0_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: 
cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp0_v8i32_v8i1(<8 x i32>) nounwind { -; SSE2-LABEL: icmp0_v8i32_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: icmp0_v8i32_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp0_v8i32_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX-LABEL: icmp0_v8i32_v8i1: ; AVX: # %bb.0: @@ -859,29 +1057,60 @@ } define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { -; SSE2-LABEL: icmp0_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} +; SSE-LABEL: icmp0_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX1-LABEL: icmp0_v16i16_v16i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; SSE41-LABEL: icmp0_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; AVX2-LABEL: icmp0_v16i16_v16i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; AVX-LABEL: icmp0_v16i16_v16i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX512F-LABEL: icmp0_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptest %ymm0, %ymm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest %ymm0, %ymm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -890,11 +1119,11 @@ define i1 @icmp0_v32i8_v32i1(<32 x i8>) nounwind { ; 
SSE2-LABEL: icmp0_v32i8_v32i1: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -923,13 +1152,25 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: por 8(%ebp), %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X86-SSE2-NEXT: movmskps %xmm0, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: packssdw %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm0, %xmm1 +; X86-SSE2-NEXT: packsswb %xmm1, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpb $-1, %al ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -937,38 +1178,71 @@ ; ; X64-SSE2-LABEL: icmp0_v8i64_v8i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm0 -; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-SSE2-NEXT: movmskps %xmm1, %eax -; X64-SSE2-NEXT: xorl $15, %eax +; X64-SSE2-NEXT: pxor %xmm4, %xmm4 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm3, %xmm5 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: packssdw %xmm5, %xmm3 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: packssdw %xmm2, %xmm1 +; X64-SSE2-NEXT: packssdw %xmm3, %xmm1 +; X64-SSE2-NEXT: packsswb %xmm1, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpb $-1, %al ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp0_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; 
AVX1-NEXT: vptest %ymm0, %ymm0 -; AVX1-NEXT: sete %al +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 -; AVX2-NEXT: sete %al +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -991,51 +1265,62 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: por 8(%ebp), %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X86-SSE2-NEXT: movmskps %xmm0, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packssdw %xmm3, %xmm2 +; X86-SSE2-NEXT: packsswb %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp0_v16i32_v16i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm0 -; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-SSE2-NEXT: movmskps %xmm1, %eax -; X64-SSE2-NEXT: xorl $15, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp0_v16i32_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp0_v16i32_v16i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm3 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm2 +; X64-SSE-NEXT: packssdw %xmm3, %xmm2 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm1 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm0 +; X64-SSE-NEXT: packssdw %xmm1, %xmm0 +; X64-SSE-NEXT: packsswb %xmm2, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp0_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: 
vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1059,62 +1344,92 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por 8(%ebp), %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpeqw %xmm3, %xmm1 +; X86-SSE2-NEXT: pcmpeqw %xmm3, %xmm0 +; X86-SSE2-NEXT: packsswb %xmm1, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp0_v32i16_v32i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm0 -; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp0_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp0_v32i16_v32i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pxor %xmm4, %xmm4 +; X64-SSE-NEXT: por %xmm2, %xmm0 +; X64-SSE-NEXT: por %xmm3, %xmm1 +; X64-SSE-NEXT: pcmpeqw %xmm4, %xmm1 +; X64-SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; X64-SSE-NEXT: packsswb %xmm1, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp0_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp0_v32i16_v32i1: -; AVX512: # 
%bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp0_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -1127,13 +1442,13 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: por 8(%ebp), %xmm1 ; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: pcmpeqb %xmm3, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -1141,48 +1456,75 @@ ; ; X64-SSE2-LABEL: icmp0_v64i8_v64i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm4 ; X64-SSE2-NEXT: por %xmm2, %xmm0 -; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: por %xmm3, %xmm1 +; X64-SSE2-NEXT: por %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm4, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v64i8_v64i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp0_v64i8_v64i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: 
vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v64i8_v64i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp0_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp0_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vptest %ymm0, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -1192,57 +1534,60 @@ ; SSE2-LABEL: icmp0_v8i1: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psllw $15, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: psllw $15, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: icmp0_v8i1: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: psllw $15, %xmm0 -; SSE41-NEXT: pmovmskb %xmm0, %eax -; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: psllw $15, %xmm1 +; SSE41-NEXT: packsswb %xmm1, %xmm1 +; SSE41-NEXT: pmovmskb %xmm1, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: icmp0_v8i1: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA +; AVX1OR2-NEXT: cmpb $-1, %al ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: icmp0_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: testq %rax, %rax ; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp0_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: testb %al, %al +; AVX512BW-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: testq %rax, %rax ; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: icmp0_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: testq %rax, %rax ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> @@ -1261,8 +1606,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -1273,12 +1620,43 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp1_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vptest %xmm1, %xmm0 -; AVX-NEXT: setb %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp1_v2i64_v2i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vptest %xmm1, %xmm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp1_v2i64_v2i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v2i64_v2i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v2i64_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $3, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -1290,7 +1668,7 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -1301,40 +1679,90 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp1_v4i32_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vptest %xmm1, %xmm0 -; AVX-NEXT: setb %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp1_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vptest %xmm1, %xmm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp1_v4i32_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v4i32_v4i1: +; AVX512BW: # %bb.0: +; 
AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v4i32_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp1_v8i16_v8i1(<8 x i16>) nounwind { -; SSE2-LABEL: icmp1_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} +; SSE-LABEL: icmp1_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE-NEXT: packsswb %xmm1, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX1OR2-LABEL: icmp1_v8i16_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; -; SSE41-LABEL: icmp1_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; AVX512F-LABEL: icmp1_v8i16_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: retq ; -; AVX-LABEL: icmp1_v8i16_v8i1: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vptest %xmm1, %xmm0 -; AVX-NEXT: setb %al -; AVX-NEXT: retq +; AVX512BW-LABEL: icmp1_v8i16_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vptest %xmm1, %xmm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v8i16_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vptest %xmm1, %xmm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: retq %a = icmp eq <8 x i16> %0, %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -1346,7 +1774,7 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -1371,28 +1799,38 @@ define i1 @icmp1_v4i64_v4i1(<4 x i64>) nounwind { ; SSE2-LABEL: icmp1_v4i64_v4i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; 
SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: icmp1_v4i64_v4i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: cmpl $15, %eax +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp1_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1404,43 +1842,64 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp1_v8i32_v8i1(<8 x i32>) nounwind { -; SSE2-LABEL: icmp1_v8i32_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: icmp1_v8i32_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp1_v8i32_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: icmp1_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; 
AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1465,30 +1924,27 @@ } define i1 @icmp1_v16i16_v16i1(<16 x i16>) nounwind { -; SSE2-LABEL: icmp1_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: icmp1_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp1_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: icmp1_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1500,13 +1956,31 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v16i16_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vptest %ymm1, %ymm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -1515,11 +1989,11 @@ define i1 @icmp1_v32i8_v32i1(<32 x i8>) nounwind { ; SSE2-LABEL: icmp1_v32i8_v32i1: ; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -1533,10 +2007,13 @@ ; ; AVX1-LABEL: icmp1_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vpcmpeqd %xmm1, 
%xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1566,14 +2043,26 @@ ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X86-SSE2-NEXT: movmskps %xmm0, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: packssdw %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm0, %xmm1 +; X86-SSE2-NEXT: packsswb %xmm1, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpb $-1, %al ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -1581,41 +2070,68 @@ ; ; X64-SSE2-LABEL: icmp1_v8i64_v8i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-SSE2-NEXT: movmskps %xmm1, %eax -; X64-SSE2-NEXT: xorl $15, %eax +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm3, %xmm5 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: packssdw %xmm5, %xmm3 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm1, %xmm2 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: packssdw %xmm2, %xmm1 +; X64-SSE2-NEXT: packssdw %xmm3, %xmm1 +; X64-SSE2-NEXT: packsswb %xmm1, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpb $-1, %al ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp1_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp1_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, 
%xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vtestps %xmm2, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm2, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1640,56 +2156,63 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; X86-SSE2-NEXT: movmskps %xmm0, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packssdw %xmm3, %xmm2 +; X86-SSE2-NEXT: packsswb %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp1_v16i32_v16i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X64-SSE2-NEXT: movmskps %xmm1, %eax -; X64-SSE2-NEXT: xorl $15, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp1_v16i32_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp1_v16i32_v16i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm3 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm2 +; X64-SSE-NEXT: packssdw %xmm3, %xmm2 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm1 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm0 +; X64-SSE-NEXT: packssdw %xmm1, %xmm0 +; X64-SSE-NEXT: packsswb %xmm2, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp1_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1713,67 +2236,94 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpeqw %xmm3, %xmm1 +; X86-SSE2-NEXT: pcmpeqw %xmm3, %xmm0 +; X86-SSE2-NEXT: packsswb %xmm1, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp1_v32i16_v32i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp1_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp1_v32i16_v32i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; X64-SSE-NEXT: pand %xmm2, %xmm0 +; X64-SSE-NEXT: pand %xmm3, %xmm1 +; X64-SSE-NEXT: pcmpeqw %xmm4, %xmm1 +; X64-SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; X64-SSE-NEXT: packsswb %xmm1, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp1_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, 
%ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v32i16_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -1786,13 +2336,13 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pand 8(%ebp), %xmm1 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: pcmpeqb %xmm3, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -1800,53 +2350,81 @@ ; ; X64-SSE2-LABEL: icmp1_v64i8_v64i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pcmpeqb %xmm4, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp1_v64i8_v64i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 
; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp1_v64i8_v64i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v64i8_v64i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vptest %ymm0, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -1885,31 +2463,28 @@ ; ; AVX512F-LABEL: icmp1_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: cmpb $-1, %al +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; AVX512F-NEXT: cmpq %rcx, %rax ; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp1_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: cmpb $-1, %al +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; AVX512BW-NEXT: cmpq %rcx, %rax ; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: icmp1_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 -; 
AVX512VL-NEXT: vpmovb2m %xmm0, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: cmpb $-1, %al +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; AVX512VL-NEXT: cmpq %rcx, %rax ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> @@ -1927,8 +2502,10 @@ ; SSE2-LABEL: icmp_v2i64_v2i1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -1939,12 +2516,42 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp_v2i64_v2i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp_v2i64_v2i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v2i64_v2i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v2i64_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $3, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, %1 %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -1955,7 +2562,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -1966,39 +2573,86 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp_v4i32_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp_v4i32_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v4i32_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: 
vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v4i32_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, %1 %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp_v8i16_v8i1(<8 x i16>, <8 x i16>) nounwind { -; SSE2-LABEL: icmp_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: ret{{[l|q]}} +; SSE-LABEL: icmp_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: ret{{[l|q]}} +; +; AVX1OR2-LABEL: icmp_v8i16_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; -; SSE41-LABEL: icmp_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; AVX512F-LABEL: icmp_v8i16_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: retq ; -; AVX-LABEL: icmp_v8i16_v8i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX512BW-LABEL: icmp_v8i16_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vptest %xmm0, %xmm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v8i16_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vptest %xmm0, %xmm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <8 x i16> %0, %1 %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -2009,7 +2663,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2040,9 +2694,12 @@ ; X86-SSE2-NEXT: subl $16, %esp ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: movmskps %xmm1, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X86-SSE2-NEXT: andps %xmm2, %xmm0 +; X86-SSE2-NEXT: movmskps %xmm0, %eax +; X86-SSE2-NEXT: cmpl $15, %eax ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2052,25 +2709,33 @@ ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 ; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] +; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE2-NEXT: andps %xmm2, %xmm0 ; X64-SSE2-NEXT: movmskps %xmm0, %eax -; X64-SSE2-NEXT: xorl $15, %eax +; X64-SSE2-NEXT: cmpl $15, %eax ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp_v4i64_v4i1: ; SSE41: # %bb.0: -; 
SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: cmpl $15, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2083,13 +2748,36 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, %1 %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b @@ -2104,37 +2792,34 @@ ; X86-SSE2-NEXT: subl $16, %esp ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: movmskps %xmm1, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: packsswb %xmm0, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpb $-1, %al ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp_v8i32_v8i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: movmskps %xmm0, %eax -; X64-SSE2-NEXT: xorl $15, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v8i32_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp_v8i32_v8i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm1 +; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE-NEXT: packssdw %xmm1, %xmm0 +; X64-SSE-NEXT: packsswb %xmm0, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpb $-1, %al +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps 
%ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2166,39 +2851,35 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqb 8(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: pcmpeqw %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpeqw 8(%ebp), %xmm1 +; X86-SSE2-NEXT: packsswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp_v16i16_v16i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp_v16i16_v16i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqw %xmm3, %xmm1 +; X64-SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; X64-SSE-NEXT: packsswb %xmm1, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2211,13 +2892,30 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v16i16_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vptest %ymm0, %ymm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vptest %ymm0, %ymm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, %1 %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -2234,7 +2932,7 @@ ; X86-SSE2-NEXT: pcmpeqb 
8(%ebp), %xmm1 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2242,27 +2940,31 @@ ; ; X64-SSE2-LABEL: icmp_v32i8_v32i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm1 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp_v32i8_v32i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2296,14 +2998,23 @@ ; X86-SSE2-NEXT: subl $16, %esp ; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqd 72(%ebp), %xmm3 -; X86-SSE2-NEXT: pcmpeqd 40(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpeqd 56(%ebp), %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: packssdw %xmm4, %xmm3 +; X86-SSE2-NEXT: pcmpeqd 40(%ebp), %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: pcmpeqd 24(%ebp), %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: movmskps %xmm0, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm1 +; X86-SSE2-NEXT: packssdw %xmm3, %xmm1 +; X86-SSE2-NEXT: packsswb %xmm1, %xmm1 +; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: cmpb $-1, %al ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2312,47 +3023,68 @@ ; X64-SSE2-LABEL: icmp_v8i64_v8i1: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; X64-SSE2-NEXT: pand %xmm3, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm3, %xmm7 ; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm2, %xmm3 +; X64-SSE2-NEXT: packssdw %xmm7, %xmm3 +; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: movmskps %xmm0, %eax -; X64-SSE2-NEXT: xorl $15, %eax +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; 
X64-SSE2-NEXT: packssdw %xmm2, %xmm1 +; X64-SSE2-NEXT: packssdw %xmm3, %xmm1 +; X64-SSE2-NEXT: packsswb %xmm1, %xmm1 +; X64-SSE2-NEXT: pmovmskb %xmm1, %eax +; X64-SSE2-NEXT: cmpb $-1, %al ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: icmp_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm7, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm6, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 -; AVX1-NEXT: sete %al +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 -; AVX2-NEXT: sete %al +; AVX2-NEXT: vpcmpeqq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2377,62 +3109,59 @@ ; X86-SSE2-NEXT: subl $16, %esp ; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqd 72(%ebp), %xmm3 -; X86-SSE2-NEXT: pcmpeqd 40(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 ; X86-SSE2-NEXT: pcmpeqd 56(%ebp), %xmm2 +; X86-SSE2-NEXT: packssdw %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd 40(%ebp), %xmm1 ; X86-SSE2-NEXT: pcmpeqd 24(%ebp), %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: movmskps %xmm0, %eax -; X86-SSE2-NEXT: xorl $15, %eax +; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: packsswb %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp_v16i32_v16i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; X64-SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: movmskps %xmm0, %eax -; X64-SSE2-NEXT: xorl $15, %eax -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; 
SSE41-LABEL: icmp_v16i32_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp_v16i32_v16i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqd %xmm7, %xmm3 +; X64-SSE-NEXT: pcmpeqd %xmm6, %xmm2 +; X64-SSE-NEXT: packssdw %xmm3, %xmm2 +; X64-SSE-NEXT: pcmpeqd %xmm5, %xmm1 +; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm0 +; X64-SSE-NEXT: packssdw %xmm1, %xmm0 +; X64-SSE-NEXT: packsswb %xmm2, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2457,74 +3186,94 @@ ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp ; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 -; X86-SSE2-NEXT: pcmpeqb 72(%ebp), %xmm3 -; X86-SSE2-NEXT: pcmpeqb 40(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pcmpeqb 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pcmpeqb 24(%ebp), %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: pcmpeqw 24(%ebp), %xmm0 +; X86-SSE2-NEXT: pcmpeqw 56(%ebp), %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqw 40(%ebp), %xmm1 +; X86-SSE2-NEXT: pcmpeqw 72(%ebp), %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: packsswb %xmm3, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp_v32i16_v32i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pcmpeqb %xmm7, %xmm3 -; X64-SSE2-NEXT: pcmpeqb %xmm5, %xmm1 -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; X64-SSE2-NEXT: pcmpeqb %xmm4, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: 
retq -; -; SSE41-LABEL: icmp_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp_v32i16_v32i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; X64-SSE-NEXT: pcmpeqw %xmm6, %xmm2 +; X64-SSE-NEXT: pand %xmm0, %xmm2 +; X64-SSE-NEXT: pcmpeqw %xmm5, %xmm1 +; X64-SSE-NEXT: pcmpeqw %xmm7, %xmm3 +; X64-SSE-NEXT: pand %xmm1, %xmm3 +; X64-SSE-NEXT: packsswb %xmm3, %xmm2 +; X64-SSE-NEXT: pmovmskb %xmm2, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpacksswb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v32i16_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, %1 %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -2538,74 +3287,90 @@ ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: 
subl $16, %esp ; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 -; X86-SSE2-NEXT: pcmpeqb 72(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqb 40(%ebp), %xmm1 -; X86-SSE2-NEXT: pand %xmm3, %xmm1 -; X86-SSE2-NEXT: pcmpeqb 56(%ebp), %xmm2 +; X86-SSE2-NEXT: pcmpeqb 72(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqb 24(%ebp), %xmm0 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pmovmskb %xmm0, %eax -; X86-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; X86-SSE2-NEXT: pcmpeqb 56(%ebp), %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: icmp_v64i8_v64i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pcmpeqb %xmm7, %xmm3 -; X64-SSE2-NEXT: pcmpeqb %xmm5, %xmm1 -; X64-SSE2-NEXT: pand %xmm3, %xmm1 -; X64-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; X64-SSE2-NEXT: pcmpeqb %xmm4, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm1, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; X64-SSE2-NEXT: sete %al -; X64-SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v64i8_v64i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: icmp_v64i8_v64i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: pcmpeqb %xmm5, %xmm1 +; X64-SSE-NEXT: pcmpeqb %xmm7, %xmm3 +; X64-SSE-NEXT: pand %xmm1, %xmm3 +; X64-SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; X64-SSE-NEXT: pcmpeqb %xmm6, %xmm2 +; X64-SSE-NEXT: pand %xmm0, %xmm2 +; X64-SSE-NEXT: pand %xmm3, %xmm2 +; X64-SSE-NEXT: pmovmskb %xmm2, %eax +; X64-SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; X64-SSE-NEXT: sete %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v64i8_v64i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v64i8_v64i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %ymm1, %ymm0, 
%ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vptest %ymm0, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, %1 %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -2617,6 +3382,3 @@ declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SSE: {{.*}} -; X64-SSE: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -60,26 +60,37 @@ ; ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: cmpq $-1, %rax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: cmpq $-1, %rax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setae %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: cmpq $-1, %rax +; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0) @@ -113,27 +124,39 @@ ; AVX1-LABEL: test_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: cmpq $-1, %rax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; 
AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: cmpq $-1, %rax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: cmpq $-1, %rax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -145,31 +168,31 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; SSE2-LABEL: test_v16i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setae %al ; SSE41-NEXT: retq ; @@ -178,10 +201,13 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: cmpq $-1, %rax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -190,18 +216,27 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: cmpq $-1, %rax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 
-; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: cmpq $-1, %rax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -282,26 +317,43 @@ ; ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-1, %eax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0) @@ -335,27 +387,45 @@ ; AVX1-LABEL: test_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-1, %eax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; 
AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-1, %eax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -367,31 +437,31 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; SSE2-LABEL: test_v32i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; @@ -400,10 +470,15 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-1, %eax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -412,18 +487,31 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; 
AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-1, %eax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -523,26 +611,49 @@ ; ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpw $-1, %ax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setae %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpw $-1, %ax +; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0) @@ -576,27 +687,51 @@ ; AVX1-LABEL: test_v32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v32i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpw $-1, %ax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpw $-1, %ax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -608,31 +743,31 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v64i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setae %al ; SSE41-NEXT: retq ; @@ -641,10 +776,17 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -653,18 +795,35 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpw $-1, %ax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpw $-1, %ax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -783,26 +942,55 @@ ; ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpb $-1, %al +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpb $-1, %al +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpb $-1, %al +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0) @@ -836,27 +1024,57 @@ ; AVX1-LABEL: test_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, 
%xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpb $-1, %al +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpb $-1, %al +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpb $-1, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -868,31 +1086,31 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 
; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; @@ -901,10 +1119,19 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpb $-1, %al +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -913,18 +1140,39 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpb $-1, %al +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpb $-1, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -60,9 +60,9 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -70,7 +70,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -80,7 +80,7 @@ ; AVX512-LABEL: 
test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -123,7 +123,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -134,7 +134,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -146,7 +146,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -200,7 +200,7 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -213,7 +213,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -226,7 +226,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -294,11 +294,11 @@ ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -306,7 +306,7 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -318,7 +318,7 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -365,7 +365,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -378,7 +378,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -392,9 +392,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -450,7 +450,7 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -465,7 +465,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -480,9 +480,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -589,11 +589,11 @@ ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -604,11 +604,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -619,11 +619,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -677,11 +677,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -693,11 +693,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -779,11 +779,11 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -797,11 +797,11 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -966,11 +966,11 @@ ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -983,7 +983,7 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1000,7 +1000,7 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1066,7 +1066,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1084,7 +1084,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1103,13 +1103,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1180,7 +1180,7 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1200,7 +1200,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1220,13 +1220,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; 
AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll --- a/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-ctpop.ll @@ -358,7 +358,7 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -378,9 +378,10 @@ ; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -388,9 +389,10 @@ ; AVX512VPOPCNT-LABEL: reduce_ctpop_v4i64: ; AVX512VPOPCNT: # %bb.0: ; AVX512VPOPCNT-NEXT: vpopcntq %ymm0, %ymm0 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512VPOPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNT-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VPOPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNT-NEXT: vmovq %xmm0, %rax ; AVX512VPOPCNT-NEXT: vzeroupper ; AVX512VPOPCNT-NEXT: retq @@ -458,9 +460,9 @@ ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -484,9 +486,12 @@ ; AVX512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %eax ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -494,9 +499,12 @@ ; AVX512VPOPCNT-LABEL: reduce_ctpop_v8i32: ; AVX512VPOPCNT: # %bb.0: ; AVX512VPOPCNT-NEXT: vpopcntd %ymm0, %ymm0 -; AVX512VPOPCNT-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VPOPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNT-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNT-NEXT: vextracti128 
$1, %ymm0, %xmm1 +; AVX512VPOPCNT-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VPOPCNT-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNT-NEXT: vmovd %xmm0, %eax ; AVX512VPOPCNT-NEXT: vzeroupper ; AVX512VPOPCNT-NEXT: retq @@ -643,15 +651,26 @@ ; AVX2-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX2-NEXT: vpaddb %ymm7, %ymm3, %ymm3 ; AVX2-NEXT: vpsadbw %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: @@ -688,15 +707,26 @@ ; AVX512VL-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsadbw %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512VL-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512VL-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX512VL-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512VL-NEXT: vpaddq %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX512VL-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512VL-NEXT: vpaddq %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX512VL-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512VL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastq %xmm3, %ymm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512VL-NEXT: retq ; ; 
AVX512VPOPCNT-LABEL: reduce_ctpop_v4i64_buildvector_v4i64: @@ -705,15 +735,26 @@ ; AVX512VPOPCNT-NEXT: vpopcntq %ymm1, %ymm1 ; AVX512VPOPCNT-NEXT: vpopcntq %ymm2, %ymm2 ; AVX512VPOPCNT-NEXT: vpopcntq %ymm3, %ymm3 -; AVX512VPOPCNT-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],ymm3[2,3] -; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNT-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3] +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm2, %ymm2 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm3, %ymm3 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX512VPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VPOPCNT-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNT-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX512VPOPCNT-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512VPOPCNT-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNT-NEXT: vpbroadcastq %xmm3, %ymm1 +; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512VPOPCNT-NEXT: retq %p0 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0) %p1 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a1) @@ -998,28 +1039,28 @@ ; AVX2-NEXT: vpaddb %ymm7, %ymm11, %ymm7 ; AVX2-NEXT: vpsadbw %ymm7, %ymm10, %ymm7 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-NEXT: vpaddq %xmm0, %xmm8, %xmm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-NEXT: vpaddq %xmm1, %xmm9, %xmm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm9, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX2-NEXT: vpaddq %xmm2, %xmm10, %xmm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm10, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX2-NEXT: vpaddq %xmm3, %xmm11, %xmm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm11, %ymm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12 -; AVX2-NEXT: vpaddq %xmm4, %xmm12, %xmm4 +; AVX2-NEXT: vpaddq %ymm4, %ymm12, %ymm4 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-NEXT: vpaddq %xmm5, %xmm13, %xmm5 +; AVX2-NEXT: vpaddq %ymm5, %ymm13, %ymm5 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX2-NEXT: vpaddq %xmm6, %xmm14, %xmm6 +; AVX2-NEXT: vpaddq %ymm6, %ymm14, %ymm6 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm15 -; AVX2-NEXT: vpaddq %xmm7, %xmm15, %xmm7 +; AVX2-NEXT: vpaddq %ymm7, %ymm15, %ymm7 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm0, %xmm8, %xmm0 ; AVX2-NEXT: vpaddq %xmm1, %xmm9, %xmm1 @@ 
-1054,83 +1095,103 @@ ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm0, %ymm8, %ymm0 ; AVX512VL-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm9, %ymm9 -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm9, %ymm9 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm9, %ymm0 +; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX512VL-NEXT: vpsadbw %ymm0, %ymm9, %ymm0 ; AVX512VL-NEXT: vpand %ymm1, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm1, %ymm8, %ymm1 ; AVX512VL-NEXT: vpshufb %ymm1, %ymm10, %ymm1 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm11, %ymm1 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsadbw %ymm1, %ymm9, %ymm1 ; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpand %ymm2, %ymm8, %ymm2 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm10, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm11, %ymm2 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsadbw %ymm2, %ymm9, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512VL-NEXT: vpshufb %ymm3, %ymm10, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm11, %ymm3 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsadbw %ymm3, %ymm9, %ymm3 ; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm4 ; AVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpshufb %ymm4, %ymm10, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm11, %ymm4 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsadbw %ymm4, %ymm9, %ymm4 ; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm5 ; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-NEXT: vpshufb %ymm5, %ymm10, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm11, %ymm5 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsadbw %ymm5, %ymm9, %ymm5 ; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm6, %ymm6 ; AVX512VL-NEXT: vpand %ymm6, %ymm8, %ymm6 ; AVX512VL-NEXT: vpshufb %ymm6, %ymm10, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm11, %ymm6 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm6, %ymm6 +; AVX512VL-NEXT: vpsadbw %ymm6, %ymm9, %ymm6 ; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm11 ; AVX512VL-NEXT: vpshufb %ymm11, %ymm10, %ymm11 ; AVX512VL-NEXT: vpsrlw $4, %ymm7, %ymm7 ; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm7 ; AVX512VL-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm11, %ymm7 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm7, %ymm7 -; AVX512VL-NEXT: vpmovqb %ymm9, %xmm8 -; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512VL-NEXT: vpsadbw %xmm9, %xmm8, %xmm8 -; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1 -; AVX512VL-NEXT: vpsadbw %xmm1, %xmm9, %xmm1 -; AVX512VL-NEXT: vpmovqb %ymm2, %xmm2 -; AVX512VL-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512VL-NEXT: vpmovqb %ymm4, %xmm4 -; AVX512VL-NEXT: vpmovqb %ymm5, %xmm5 -; AVX512VL-NEXT: vpmovqb %ymm6, %xmm6 -; AVX512VL-NEXT: vpmovqb %ymm7, %xmm7 -; AVX512VL-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm6, %ymm6 -; AVX512VL-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm4, %ymm4 -; AVX512VL-NEXT: 
vbroadcasti128 {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12] -; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsadbw %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] -; AVX512VL-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512VL-NEXT: vpsadbw %ymm7, %ymm9, %ymm7 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm8, %ymm0 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512VL-NEXT: vpaddq %ymm1, %ymm9, %ymm1 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512VL-NEXT: vpaddq %ymm2, %ymm10, %ymm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512VL-NEXT: vpaddq %ymm3, %ymm11, %ymm3 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX512VL-NEXT: vpaddq %ymm4, %ymm12, %ymm4 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512VL-NEXT: vpaddq %ymm5, %ymm13, %ymm5 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm6, %xmm14 +; AVX512VL-NEXT: vpaddq %ymm6, %ymm14, %ymm6 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] +; AVX512VL-NEXT: vextracti128 $1, %ymm7, %xmm15 +; AVX512VL-NEXT: vpaddq %ymm7, %ymm15, %ymm7 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] +; AVX512VL-NEXT: vpaddq %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm1, %xmm9, %xmm1 +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: vpaddq %xmm2, %xmm10, %xmm1 +; AVX512VL-NEXT: vmovd %xmm1, %ecx +; AVX512VL-NEXT: vpaddq %xmm3, %xmm11, %xmm1 +; AVX512VL-NEXT: vmovd %xmm1, %edx +; AVX512VL-NEXT: vpaddq %xmm4, %xmm12, %xmm1 +; AVX512VL-NEXT: vpaddq %xmm5, %xmm13, %xmm2 +; AVX512VL-NEXT: vmovd %xmm2, %esi +; AVX512VL-NEXT: vpaddq %xmm6, %xmm14, %xmm2 +; AVX512VL-NEXT: vmovd %xmm2, %edi +; AVX512VL-NEXT: vpaddq %xmm7, %xmm15, %xmm2 +; AVX512VL-NEXT: vmovd %xmm2, %r8d +; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; AVX512VL-NEXT: vpinsrd $2, %edi, %xmm1, %xmm1 +; AVX512VL-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512VPOPCNT-LABEL: reduce_ctpop_v4i64_buildvector_v8i32: @@ -1143,32 +1204,49 @@ ; AVX512VPOPCNT-NEXT: vpopcntq %ymm5, %ymm5 ; AVX512VPOPCNT-NEXT: vpopcntq %ymm6, %ymm6 ; AVX512VPOPCNT-NEXT: vpopcntq %ymm7, %ymm7 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512VPOPCNT-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX512VPOPCNT-NEXT: vpsadbw %xmm0, %xmm8, %xmm0 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm1, %xmm1 -; AVX512VPOPCNT-NEXT: vpsadbw %xmm1, %xmm8, %xmm1 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm2, %xmm2 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm4, %xmm4 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm5, %xmm5 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm6, %xmm6 -; AVX512VPOPCNT-NEXT: vpmovqb %ymm7, %xmm7 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm0, %xmm8 +; 
AVX512VPOPCNT-NEXT: vpaddq %ymm0, %ymm8, %ymm0 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm0, %xmm8, %xmm0 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512VPOPCNT-NEXT: vpaddq %ymm1, %ymm8, %ymm1 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm1, %xmm8, %xmm1 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512VPOPCNT-NEXT: vpaddq %ymm2, %ymm8, %ymm2 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm2, %xmm8, %xmm2 +; AVX512VPOPCNT-NEXT: vmovq %xmm2, %rax +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX512VPOPCNT-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX512VPOPCNT-NEXT: vmovq %xmm2, %rcx +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512VPOPCNT-NEXT: vpaddq %ymm2, %ymm4, %ymm2 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512VPOPCNT-NEXT: vpaddq %ymm3, %ymm5, %ymm3 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm6, %ymm4 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm5, %xmm4, %xmm4 +; AVX512VPOPCNT-NEXT: vmovq %xmm4, %rdx +; AVX512VPOPCNT-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512VPOPCNT-NEXT: vpaddq %ymm4, %ymm7, %ymm4 +; AVX512VPOPCNT-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX512VPOPCNT-NEXT: vpaddq %xmm5, %xmm4, %xmm4 +; AVX512VPOPCNT-NEXT: vmovq %xmm4, %rsi +; AVX512VPOPCNT-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512VPOPCNT-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; AVX512VPOPCNT-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 ; AVX512VPOPCNT-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VPOPCNT-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] -; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512VPOPCNT-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VPOPCNT-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512VPOPCNT-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm1 -; AVX512VPOPCNT-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512VPOPCNT-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512VPOPCNT-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12] -; AVX512VPOPCNT-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512VPOPCNT-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 -; AVX512VPOPCNT-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512VPOPCNT-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; AVX512VPOPCNT-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX512VPOPCNT-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VPOPCNT-NEXT: retq %p0 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0) %p1 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a1) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -90,7 +90,8 @@ ; ; AVX1-FAST-LABEL: test_v4f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; 
AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq @@ -143,7 +144,7 @@ ; AVX1-SLOW-LABEL: test_v8f32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -155,9 +156,11 @@ ; AVX1-FAST-LABEL: test_v8f32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -165,7 +168,7 @@ ; AVX2-LABEL: test_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -177,7 +180,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -221,7 +224,7 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -234,7 +237,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 @@ -246,7 +249,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -260,7 +263,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 ; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -346,7 +349,8 @@ ; ; AVX1-FAST-LABEL: test_v4f32_zero: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, 
%xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -394,7 +398,7 @@ ; AVX1-SLOW-LABEL: test_v8f32_zero: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -405,16 +409,18 @@ ; AVX1-FAST-LABEL: test_v8f32_zero: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f32_zero: ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -425,7 +431,7 @@ ; AVX512-LABEL: test_v8f32_zero: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -466,7 +472,7 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -478,7 +484,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 @@ -489,7 +495,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -502,7 +508,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -587,7 +593,8 @@ ; ; AVX1-FAST-LABEL: test_v4f32_undef: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -635,7 +642,7 @@ ; AVX1-SLOW-LABEL: test_v8f32_undef: ; AVX1-SLOW: # %bb.0: ; 
AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -646,16 +653,18 @@ ; AVX1-FAST-LABEL: test_v8f32_undef: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f32_undef: ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -666,7 +675,7 @@ ; AVX512-LABEL: test_v8f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -707,7 +716,7 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -719,7 +728,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 @@ -730,7 +739,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -743,7 +752,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -820,8 +829,9 @@ ; AVX1-FAST-LABEL: test_v4f64: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -1037,8 +1047,9 @@ ; AVX1-FAST-LABEL: test_v4f64_zero: ; AVX1-FAST: # %bb.0: ; 
AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; @@ -1241,8 +1252,9 @@ ; AVX1-FAST-LABEL: test_v4f64_undef: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll @@ -107,7 +107,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -118,7 +118,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -159,7 +159,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -172,7 +172,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -112,7 +112,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -123,7 +123,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -164,7 +164,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, 
%xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -177,7 +177,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -153,7 +153,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -164,7 +164,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -205,7 +205,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -218,7 +218,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -113,7 +113,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -125,7 +125,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -169,7 +169,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -183,7 +183,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 ; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = 
xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -292,7 +292,7 @@ ; AVX-LABEL: test_v8f32_zero: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -303,7 +303,7 @@ ; AVX512-LABEL: test_v8f32_zero: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -344,7 +344,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -357,7 +357,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -465,7 +465,7 @@ ; AVX-LABEL: test_v8f32_undef: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -476,7 +476,7 @@ ; AVX512-LABEL: test_v8f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -517,7 +517,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -530,7 +530,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -225,7 +225,7 @@ ; AVX512DQVL-LABEL: test_v4i64: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax @@ -437,7 +437,7 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullq 
%zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax @@ -757,7 +757,7 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax @@ -869,9 +869,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -881,9 +881,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -945,9 +945,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -959,9 +959,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1047,9 +1047,9 @@ ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1062,9 +1062,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmulld 
%xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1186,11 +1186,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1201,11 +1201,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1255,11 +1255,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1272,11 +1272,11 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -1289,11 +1289,11 @@ ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax @@ -1306,11 +1306,11 @@ ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; 
AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovd %xmm0, %eax @@ -1323,11 +1323,11 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax @@ -1389,11 +1389,11 @@ ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1407,11 +1407,11 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -1425,11 +1425,11 @@ ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax @@ -1445,11 +1445,11 @@ ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; 
AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovd %xmm0, %eax @@ -1465,11 +1465,11 @@ ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax @@ -1646,13 +1646,13 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmullw %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1668,19 +1668,19 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmullw %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1688,17 +1688,18 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -1707,39 +1708,127 @@ ; ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v32i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovd %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovd %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq %1 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1748,24 +1837,24 @@ ; SSE2-LABEL: test_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm1 -; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm4, %xmm6 +; SSE2-NEXT: pmullw %xmm5, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmullw %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1781,27 +1870,27 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm4, %xmm1 ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm0 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pmullw %xmm4, %xmm5 +; SSE41-NEXT: pmullw %xmm3, %xmm6 +; SSE41-NEXT: pmullw %xmm5, %xmm6 +; SSE41-NEXT: pmullw %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE41-NEXT: pmullw %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1809,26 +1898,27 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw 
%xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -1838,19 +1928,31 @@ ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} 
xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -1859,18 +1961,33 @@ ; ; AVX512BW-LABEL: test_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; 
AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -1879,18 +1996,39 @@ ; ; AVX512BWVL-LABEL: test_v64i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -1901,21 +2039,33 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovd %xmm0, %eax +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1924,21 +2074,33 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovd %xmm0, %eax +; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq @@ -1949,142 +2111,145 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm8, %xmm10 +; SSE2-NEXT: pmullw %xmm9, %xmm10 ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw 
{{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm1, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm8, %xmm3 -; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm7, %xmm1 -; SSE2-NEXT: pmullw %xmm5, %xmm1 -; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm11, %xmm8 +; SSE2-NEXT: pmullw %xmm9, %xmm8 +; SSE2-NEXT: pmullw %xmm10, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm0 ; SSE2-NEXT: pmullw %xmm4, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pmullw %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: movdqa %xmm2, %xmm10 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm9, %xmm10 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm4, %xmm0 +; SSE41-NEXT: pmullw %xmm10, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm8, %xmm7 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm8, %xmm3 ; SSE41-NEXT: pmullw %xmm7, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm5 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm1 ; SSE41-NEXT: pmullw %xmm5, %xmm1 ; SSE41-NEXT: pmullw %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm6 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: pmullw %xmm0, %xmm1 +; SSE41-NEXT: pmullw %xmm8, %xmm6 +; SSE41-NEXT: pmullw %xmm9, %xmm2 ; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; 
SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pmullw %xmm4, %xmm10 +; SSE41-NEXT: pmullw %xmm7, %xmm11 +; SSE41-NEXT: pmullw %xmm10, %xmm11 +; SSE41-NEXT: pmullw %xmm2, %xmm11 +; SSE41-NEXT: pmullw %xmm1, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE41-NEXT: pmullw %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm4, 
%xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -2093,28 +2258,40 @@ ; ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -2123,22 +2300,36 @@ ; ; AVX512BW-LABEL: test_v128i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpackuswb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -2147,22 +2338,42 @@ ; ; AVX512BWVL-LABEL: test_v128i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm1, %zmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; AVX512BWVL-NEXT: vpackuswb %zmm0, %zmm1, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; 
AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -2172,31 +2383,43 @@ ; AVX512DQ-LABEL: test_v128i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm5, %ymm6, %ymm5 +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; 
AVX512DQ-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovd %xmm0, %eax +; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -2204,31 +2427,43 @@ ; AVX512DQVL-LABEL: test_v128i8: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; 
AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm5, %ymm6, %ymm5 +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQVL-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovd %xmm0, %eax +; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -13,42 +13,47 @@ ; define i1 @trunc_v2i64_v2i1(<2 x i64>) nounwind { -; SSE2-LABEL: trunc_v2i64_v2i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: 
setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v2i64_v2i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_v2i1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1OR2-LABEL: trunc_v2i64_v2i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_v2i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al ; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v2i64_v2i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al ; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_v2i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 +; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> @@ -57,42 +62,47 @@ } define i1 @trunc_v4i32_v4i1(<4 x i32>) nounwind { -; SSE2-LABEL: trunc_v4i32_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v4i32_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1OR2-LABEL: trunc_v4i32_v4i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i32_v4i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al ; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i32_v4i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al ; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i32_v4i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0 +; 
AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> @@ -101,23 +111,19 @@ } define i1 @trunc_v8i16_v8i1(<8 x i16>) nounwind { -; SSE2-LABEL: trunc_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1OR2-LABEL: trunc_v8i16_v8i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; @@ -145,88 +151,83 @@ } define i1 @trunc_v16i8_v16i1(<16 x i8>) nounwind { -; SSE2-LABEL: trunc_v16i8_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v16i8_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq -; -; AVX1OR2-LABEL: trunc_v16i8_v16i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i8_v16i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i8_v16i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: retq +; SSE-LABEL: trunc_v16i8_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; -; AVX512VL-LABEL: trunc_v16i8_v16i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setne %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v16i8_v16i1: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } define i1 @trunc_v4i64_v4i1(<4 x i64>) nounwind { -; SSE2-LABEL: trunc_v4i64_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v4i64_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: 
trunc_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_v4i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b @@ -235,31 +236,43 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) nounwind { ; SSE2-LABEL: trunc_v8i32_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE2-NEXT: setne %al ; SSE2-NEXT: ret{{[l|q]}} ; ; SSE41-LABEL: trunc_v8i32_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; 
AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -277,33 +290,38 @@ } define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { -; SSE2-LABEL: trunc_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: trunc_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -321,41 +339,40 @@ } define i1 @trunc_v32i8_v32i1(<32 x i8>) nounwind { -; SSE2-LABEL: trunc_v32i8_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: ret{{[l|q]}} -; -; SSE41-LABEL: trunc_v32i8_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: trunc_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_v32i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; 
AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -403,26 +420,40 @@ ; ; SSE41-LABEL: trunc_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -447,11 +478,16 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: por 8(%ebp), %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pslld $31, %xmm1 -; X86-SSE2-NEXT: movmskps %xmm1, %eax +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE2-NEXT: psllw $7, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: testl %eax, %eax ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: movl %ebp, %esp @@ -460,37 +496,65 @@ ; ; X64-SSE2-LABEL: trunc_v16i32_v16i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm0 -; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: pslld $31, %xmm0 -; X64-SSE2-NEXT: movmskps %xmm0, %eax +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X64-SSE2-NEXT: pand %xmm4, %xmm3 +; X64-SSE2-NEXT: pand %xmm4, %xmm2 +; X64-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE2-NEXT: pand %xmm4, %xmm1 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X64-SSE2-NEXT: psllw 
$7, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: testl %eax, %eax ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_v16i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -514,50 +578,65 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: por 8(%ebp), %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: psllw $7, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax -; X86-SSE2-NEXT: testl $21845, %eax # imm = 0x5555 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; X86-SSE2-NEXT: pand %xmm3, %xmm1 +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm3, %xmm2 +; X86-SSE2-NEXT: pand 8(%ebp), %xmm3 +; X86-SSE2-NEXT: packuswb %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: psllw $7, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: testl %eax, %eax ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; -; X64-SSE2-LABEL: trunc_v32i16_v32i1: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: por %xmm3, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm0 -; X64-SSE2-NEXT: por %xmm1, %xmm0 -; X64-SSE2-NEXT: psllw $7, %xmm0 -; X64-SSE2-NEXT: pmovmskb %xmm0, %eax -; X64-SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; X64-SSE2-NEXT: setne %al -; X64-SSE2-NEXT: retq -; -; 
SSE41-LABEL: trunc_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; X64-SSE-LABEL: trunc_v32i16_v32i1: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; X64-SSE-NEXT: pand %xmm4, %xmm3 +; X64-SSE-NEXT: pand %xmm4, %xmm2 +; X64-SSE-NEXT: packuswb %xmm3, %xmm2 +; X64-SSE-NEXT: pand %xmm4, %xmm1 +; X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE-NEXT: por %xmm2, %xmm0 +; X64-SSE-NEXT: psllw $7, %xmm0 +; X64-SSE-NEXT: pmovmskb %xmm0, %eax +; X64-SSE-NEXT: testl %eax, %eax +; X64-SSE-NEXT: setne %al +; X64-SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -629,13 +708,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm0, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a) ret i1 %b @@ -779,11 +877,8 @@ ; AVX512F: # 
%bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX512F-NEXT: setne %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp0_v8i16_v8i1: @@ -1041,9 +1136,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1414,9 +1508,8 @@ ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512F-NEXT: kortestw %k0, %k0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper @@ -1675,11 +1768,8 @@ ; AVX512F-LABEL: icmp_v8i16_v8i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX512F-NEXT: setne %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp_v8i16_v8i1: @@ -1983,9 +2073,8 @@ ; AVX512F-LABEL: icmp_v16i16_v16i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2367,9 +2456,8 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512F-NEXT: kortestw %k0, %k0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper @@ -2500,5 +2588,3 @@ declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -55,12 +55,41 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: testq %rax, %rax +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 @@ -91,7 +120,12 @@ ; AVX1-LABEL: test_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -99,15 +133,26 @@ ; AVX2-LABEL: test_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: testq %rax, %rax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -119,30 +164,30 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; SSE2-LABEL: test_v16i64: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por 
%xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -151,7 +196,12 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -161,7 +211,12 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -169,8 +224,14 @@ ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: testq %rax, %rax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -246,12 +307,47 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 
+; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 @@ -282,7 +378,14 @@ ; AVX1-LABEL: test_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -290,15 +393,30 @@ ; AVX2-LABEL: test_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -310,30 +428,30 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; SSE2-LABEL: test_v32i32: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -342,7 +460,14 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; 
AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -352,7 +477,14 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -360,8 +492,16 @@ ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -456,12 +596,53 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testw %ax, %ax +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testw %ax, %ax +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testw %ax, %ax +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 @@ -492,7 +673,16 @@ ; AVX1-LABEL: test_v32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, 
%ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testw %ax, %ax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -500,15 +690,34 @@ ; AVX2-LABEL: test_v32i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testw %ax, %ax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testw %ax, %ax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -520,30 +729,30 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v64i16: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -552,7 +761,16 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; 
AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testw %ax, %ax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -562,7 +780,16 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testw %ax, %ax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -570,8 +797,18 @@ ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testw %ax, %ax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -685,12 +922,59 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al +; AVX512-NEXT: sete %al +; 
AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 @@ -721,7 +1005,18 @@ ; AVX1-LABEL: test_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -729,15 +1024,38 @@ ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -749,30 +1067,30 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -781,7 
+1099,18 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -791,7 +1120,18 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -799,8 +1139,20 @@ ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -877,23 +1229,42 @@ ; ; AVX1-LABEL: mask_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: mask_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: mask_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -904,30 +1275,68 @@ } define i1 @trunc_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: trunc_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq +; SSE-LABEL: trunc_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: testb %al, %al +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; SSE41-LABEL: trunc_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; AVX1-LABEL: trunc_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; AVX-LABEL: trunc_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-LABEL: trunc_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = trunc i16 %1 to i8 %3 = icmp ne i8 %2, 0 @@ -937,29 +1346,29 @@ define i1 @mask_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: mask_v128i8: ; SSE2: # %bb.0: -; 
SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: mask_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -968,7 +1377,18 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -978,8 +1398,18 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -987,8 +1417,20 @@ ; AVX512-LABEL: mask_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb $1, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: 
retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -60,9 +60,9 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -70,7 +70,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -80,7 +80,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -123,7 +123,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -134,7 +134,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -146,7 +146,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -200,7 +200,7 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -213,7 +213,7 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -226,7 +226,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -294,11 +294,11 @@ ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -306,7 +306,7 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -318,7 +318,7 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -365,7 +365,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -378,7 +378,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -392,9 +392,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -450,7 +450,7 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -465,7 +465,7 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -480,9 +480,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -589,11 +589,11 @@ ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -604,11 +604,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -619,11 +619,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -677,11 +677,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -693,11 +693,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -779,11 +779,11 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -797,11 +797,11 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor 
%xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -966,11 +966,11 @@ ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -983,7 +983,7 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1000,7 +1000,7 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1066,7 +1066,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1084,7 +1084,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1103,13 +1103,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1180,7 +1180,7 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ 
-1200,7 +1200,7 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1220,13 +1220,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -180,8 +180,8 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -203,7 +203,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -372,8 +372,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -398,7 +398,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -697,8 +697,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -725,7 +725,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, 
%zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -871,9 +871,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -883,9 +883,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -959,9 +959,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -973,9 +973,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1081,9 +1081,9 @@ ; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1096,9 +1096,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1269,38 +1269,32 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, 
%xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1353,42 +1347,34 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; 
AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1416,15 +1402,15 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxsw %xmm7, %xmm3 -; SSE4-NEXT: pmaxsw %xmm5, %xmm1 -; SSE4-NEXT: pmaxsw %xmm3, %xmm1 ; SSE4-NEXT: pmaxsw %xmm6, %xmm2 ; SSE4-NEXT: pmaxsw %xmm4, %xmm0 ; SSE4-NEXT: pmaxsw %xmm2, %xmm0 -; SSE4-NEXT: pmaxsw %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxsw %xmm7, %xmm3 +; SSE4-NEXT: pmaxsw %xmm5, %xmm1 +; SSE4-NEXT: pmaxsw %xmm3, %xmm1 +; SSE4-NEXT: pmaxsw %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32767, %eax # imm = 0x7FFF ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1432,17 +1418,17 @@ ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1457,44 +1443,35 @@ ; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; 
AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1818,44 +1795,36 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorb $127, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorb $127, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: 
xorb $127, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1944,48 +1913,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorb $127, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorb $127, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorb $127, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; 
AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2060,18 +2019,18 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxsb %xmm7, %xmm3 -; SSE4-NEXT: pmaxsb %xmm5, %xmm1 -; SSE4-NEXT: pmaxsb %xmm3, %xmm1 ; SSE4-NEXT: pmaxsb %xmm6, %xmm2 ; SSE4-NEXT: pmaxsb %xmm4, %xmm0 ; SSE4-NEXT: pmaxsb %xmm2, %xmm0 -; SSE4-NEXT: pmaxsb %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxsb %xmm7, %xmm3 +; SSE4-NEXT: pmaxsb %xmm5, %xmm1 +; SSE4-NEXT: pmaxsb %xmm3, %xmm1 +; SSE4-NEXT: pmaxsb %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorb $127, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2079,17 +2038,17 @@ ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -2106,50 +2065,39 @@ ; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorb $127, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorb 
$127, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorb $127, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -180,8 +180,8 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -203,7 +203,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -372,8 +372,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -398,7 +398,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -697,8 +697,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm0, 
%ymm1, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -725,7 +725,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -871,9 +871,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -883,9 +883,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -959,9 +959,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -973,9 +973,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1081,9 +1081,9 @@ ; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1096,9 +1096,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; 
AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1269,38 +1269,32 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1353,42 +1347,34 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl 
$32768, %eax # imm = 0x8000 -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1416,15 +1402,15 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pminsw %xmm7, %xmm3 -; SSE4-NEXT: pminsw %xmm5, %xmm1 -; SSE4-NEXT: pminsw %xmm3, %xmm1 ; SSE4-NEXT: pminsw %xmm6, %xmm2 ; SSE4-NEXT: pminsw %xmm4, %xmm0 ; SSE4-NEXT: pminsw %xmm2, %xmm0 -; SSE4-NEXT: pminsw %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminsw %xmm7, %xmm3 +; SSE4-NEXT: pminsw %xmm5, %xmm1 +; SSE4-NEXT: pminsw %xmm3, %xmm1 +; SSE4-NEXT: pminsw %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1432,17 +1418,17 @@ ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminsw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminsw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminsw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1457,44 +1443,35 @@ ; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld 
$16, %xmm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1818,44 +1795,36 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: addb $-128, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: 
addb $-128, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: addb $-128, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1944,48 +1913,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: addb $-128, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: addb $-128, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: addb $-128, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2060,18 +2019,18 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pminsb %xmm7, %xmm3 -; SSE4-NEXT: pminsb %xmm5, %xmm1 -; SSE4-NEXT: pminsb %xmm3, %xmm1 ; SSE4-NEXT: pminsb %xmm6, %xmm2 ; SSE4-NEXT: pminsb %xmm4, %xmm0 ; SSE4-NEXT: pminsb %xmm2, %xmm0 -; SSE4-NEXT: pminsb %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pminsb %xmm7, %xmm3 +; SSE4-NEXT: pminsb %xmm5, %xmm1 +; SSE4-NEXT: pminsb %xmm3, %xmm1 +; SSE4-NEXT: pminsb %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: addb $-128, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2079,17 +2038,17 @@ ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminsb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminsb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -2106,50 +2065,39 @@ ; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: addb $-128, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; 
AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: addb $-128, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: addb $-128, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -211,10 +211,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -238,7 +238,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -431,10 +431,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: 
vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -461,7 +461,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -809,10 +809,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 @@ -841,7 +841,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -1005,9 +1005,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1017,9 +1017,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1108,9 +1108,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1122,9 +1122,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, 
%zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1257,9 +1257,9 @@ ; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1272,9 +1272,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1481,39 +1481,32 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1574,43 +1567,34 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 
$1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1648,16 +1632,16 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxuw %xmm7, %xmm3 -; SSE4-NEXT: pmaxuw %xmm5, %xmm1 -; SSE4-NEXT: pmaxuw %xmm3, %xmm1 ; SSE4-NEXT: pmaxuw %xmm6, %xmm2 ; SSE4-NEXT: pmaxuw %xmm4, %xmm0 ; SSE4-NEXT: pmaxuw %xmm2, %xmm0 -; SSE4-NEXT: pmaxuw %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxuw %xmm7, %xmm3 +; SSE4-NEXT: pmaxuw %xmm5, %xmm1 +; SSE4-NEXT: pmaxuw %xmm3, %xmm1 +; SSE4-NEXT: pmaxuw %xmm0, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notl %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1665,17 +1649,17 @@ ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxuw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxuw %xmm4, 
%xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxuw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1691,45 +1675,35 @@ ; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1964,45 +1938,36 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notb %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -2065,49 +2030,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notb %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # 
kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2138,19 +2092,19 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxub %xmm7, %xmm3 -; SSE4-NEXT: pmaxub %xmm5, %xmm1 -; SSE4-NEXT: pmaxub %xmm3, %xmm1 ; SSE4-NEXT: pmaxub %xmm6, %xmm2 ; SSE4-NEXT: pmaxub %xmm4, %xmm0 ; SSE4-NEXT: pmaxub %xmm2, %xmm0 -; SSE4-NEXT: pmaxub %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 -; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxub %xmm7, %xmm3 +; SSE4-NEXT: pmaxub %xmm5, %xmm1 +; SSE4-NEXT: pmaxub %xmm3, %xmm1 +; SSE4-NEXT: pmaxub %xmm0, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notb %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2158,17 +2112,17 @@ ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxub %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxub %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxub %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -2186,51 +2140,39 @@ ; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notb %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -212,10 +212,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; 
AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -239,7 +239,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -434,10 +434,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -464,7 +464,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -813,10 +813,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 @@ -845,7 +845,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -1009,9 +1009,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1021,9 +1021,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1112,9 
+1112,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1126,9 +1126,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1261,9 +1261,9 @@ ; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1276,9 +1276,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1470,8 +1470,13 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -1480,8 +1485,13 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1546,8 +1556,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, 
%xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -1556,10 +1571,15 @@ ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1611,31 +1631,31 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pminuw %xmm7, %xmm3 -; SSE4-NEXT: pminuw %xmm5, %xmm1 -; SSE4-NEXT: pminuw %xmm3, %xmm1 ; SSE4-NEXT: pminuw %xmm6, %xmm2 ; SSE4-NEXT: pminuw %xmm4, %xmm0 ; SSE4-NEXT: pminuw %xmm2, %xmm0 -; SSE4-NEXT: pminuw %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminuw %xmm7, %xmm3 +; SSE4-NEXT: pminuw %xmm5, %xmm1 +; SSE4-NEXT: pminuw %xmm3, %xmm1 +; SSE4-NEXT: pminuw %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminuw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminuw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax @@ -1648,8 +1668,13 @@ ; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -1659,10 +1684,15 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; 
AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1874,10 +1904,15 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1886,10 +1921,15 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1950,10 +1990,15 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1962,12 +2007,17 @@ ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -2002,34 +2052,34 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pminub %xmm7, %xmm3 -; SSE4-NEXT: pminub %xmm5, %xmm1 -; SSE4-NEXT: pminub %xmm3, %xmm1 ; SSE4-NEXT: pminub %xmm6, %xmm2 ; SSE4-NEXT: pminub %xmm4, %xmm0 ; SSE4-NEXT: pminub %xmm2, %xmm0 -; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: 
psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm7, %xmm3 +; SSE4-NEXT: pminub %xmm5, %xmm1 +; SSE4-NEXT: pminub %xmm3, %xmm1 ; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $al killed $al killed $eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminub %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminub %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -2044,10 +2094,15 @@ ; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -2057,12 +2112,17 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -60,9 +60,9 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; 
AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -70,7 +70,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -80,7 +80,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -123,7 +123,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -134,7 +134,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -146,7 +146,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -200,7 +200,7 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -213,7 +213,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -226,7 +226,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -294,11 +294,11 @@ ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -306,7 +306,7 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -318,7 +318,7 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -365,7 +365,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -378,7 +378,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -392,9 +392,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -450,7 +450,7 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -465,7 +465,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -480,9 +480,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -589,11 +589,11 @@ ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: 
vmovd %xmm0, %eax @@ -604,11 +604,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -619,11 +619,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -677,11 +677,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -693,11 +693,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -779,11 +779,11 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -797,11 +797,11 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -966,11 +966,11 @@ ; AVX1-LABEL: test_v32i8: 
; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -983,7 +983,7 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1000,7 +1000,7 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1066,7 +1066,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1084,7 +1084,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1103,13 +1103,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1180,7 +1180,7 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1200,7 +1200,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1220,13 +1220,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; 
AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -200,10 +200,10 @@ ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -211,14 +211,14 @@ ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -227,10 +227,10 @@ ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -238,56 +238,56 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: 
vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: ; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; 
AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -306,11 +306,11 @@ ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} @@ -328,23 +328,23 @@ ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; 
AVX512F-ONLY-NEXT: retq ; @@ -355,11 +355,11 @@ ; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 @@ -377,23 +377,23 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -409,28 +409,28 @@ ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; 
AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -444,28 +444,28 @@ ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: 
vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 @@ -600,22 +600,17 @@ ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -624,21 +619,16 @@ ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -673,25 +663,20 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, 
%ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -700,50 +685,59 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor3_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, 
%zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> @@ -755,62 +749,52 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3 +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] 
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor3_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k1 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -820,481 +804,80 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 
256(%rsi), %zmm2 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor3_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k0 -; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw (%rdi), %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k3 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; 
AVX512BW-NEXT: kshiftrd $3, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 -; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 -; 
AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $18, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: 
kshiftrd $20, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 
%zmm1, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31] +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,48,48,48,49,49,49,50,50,50,51,51,51,52,52,52,53] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,16,16,16,17,17,17,18,18,18,19,19,19,20,20,20,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr 
%in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> @@ -1306,1024 +889,229 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm12, %zmm12, %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: 
vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor3_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; 
AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor3_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k0 -; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw (%rdi), %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k3 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovq %k3, %k5 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw 
%k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $3, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $4, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 -; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, 
%k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $62, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $63, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: 
kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; 
AVX512BW-NEXT: kshiftrq $52, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $42, %k0, %k1 -; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $47, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $37, %k0, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $38, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 
-; AVX512BW-NEXT: kandw %k3, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $35, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $26, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $31, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $21, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $22, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; 
AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $25, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, 
%k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $20, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $10, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $14, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $15, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $7, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $8, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $9, %k0, %k0 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: 
kshiftrw $3, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,16,16,16,17,17,17,18,18,18,19,19,19,20,20,20,21,37,37,38,38,38,39,39,39,40,40,40,41,41,41,42,42,58,59,59,59,60,60,60,61,61,61,62,62,62,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31,32,32,32,33,33,33,34,34,34,35,35,35,36,36,36,37,53,53,54,54,54,55,55,55,56,56,56,57,57,57,58,58] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,48,48,48,49,49,49,50,50,50,51,51,51,52,52,52,53] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 
704(%rsi), %zmm11 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,48,48,48,49,49,49,50,50,50,51,51,51,52,52,52,53,53,53,54,54,54,55,55,55,56,56,56,57,57,57,58,58,58,59,59,59,60,60,60,61,61,61,62,62,62,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31,32,32,32,33,33,33,34,34,34,35,35,35,36,36,36,37,37,37,38,38,38,39,39,39,40,40,40,41,41,41,42,42] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,16,16,16,17,17,17,18,18,18,19,19,19,20,20,20,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 
128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison) @@ -2548,26 +1336,26 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -2575,68 +1363,68 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: ; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: 
vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -2653,17 +1441,17 @@ ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} @@ -2675,23 +1463,23 @@ ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 
%zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -2700,17 +1488,17 @@ ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -2722,23 +1510,23 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 
192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2753,28 +1541,28 @@ ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -2788,28 +1576,28 @@ ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; 
AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -3000,50 +1788,50 @@ ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), 
%zmm10 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -3063,50 +1851,50 @@ ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: 
vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 
%zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 @@ -3246,15 +2034,10 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -3262,12 +2045,12 @@ ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -3275,27 +2058,22 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: 
vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 128(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3310,12 +2088,12 @@ ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa %ymm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq @@ -3331,12 +2109,12 @@ ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq @@ -3354,35 +2132,30 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd 
%zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -3391,70 +2164,87 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) 
; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor5_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k1, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512BW-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; 
AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512VBMI-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> @@ -3466,834 +2256,199 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} 
{z} +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor5_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} 
-; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx) +; AVX512DQ-NEXT: 
vmovdqa64 %zmm1, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor5_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k5 -; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kmovw (%rdi), %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $2, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 
-; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $3, %k5, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $28, %k5, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k3 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $30, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $31, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $25, %k5, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $26, %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $27, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 
512(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $22, %k5, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $23, %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $20, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $21, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: 
kshiftrd $17, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $18, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 -; AVX512BW-NEXT: korw 
%k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $9, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k7, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $11, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k3 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z} -; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw 
$3, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; 
AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31] +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,35,35,35,35,36,36,36,36,36,37,37,37,37,37,38,38,54,54,54,55,55,55,55,55,56,56,56,56,56,57,57,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm0, %zmm1, 
%zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,22,22,22,23,23,23,23,23,24,24,24,24,24,25,25,25] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> @@ -4305,1546 +2460,353 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 
2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm8, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm3, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm8, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm3, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm7, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm8, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm6 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm2 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm6, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm5, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm6, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm4 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm20, %zmm20, %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm6, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; 
AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm17 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm15 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) -; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm8, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm7, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm6 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, 
%zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm6, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm6, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm5, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm6, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm17 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm15 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k1} {z} ; 
AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor5_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k5 -; AVX512BW-NEXT: kshiftrq $1, %k5, %k0 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kmovw (%rdi), %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; 
AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k3 -; AVX512BW-NEXT: kshiftrq $2, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k7 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k7, %k7 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k7, %k7 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $3, %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: 
vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $5, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $6, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $7, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $10, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $11, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $13, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $14, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; 
AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, 
%k0, %k7 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $23, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; 
AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $24, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 -; 
AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $28, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $29, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; 
AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $39, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $40, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: 
kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; 
AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, 
%k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 
-; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,22,22,22,23,23,23,23,23,24,24,24,24,24,25,25,25,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,60,61,61,61,61,61,62,62,62,62,62,63,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,44,45,45,45,45,45,46,46,46,46,46,47,47,47,47,47,48,48,48,48,48,49,49,49,49,49,50,50,50,50,50,51] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31,32,32,32,32,32,33,33,33,33,33,34,34,34,34,34,35,51,51,51,51,52,52,52,52,52,53,53,53,53,53,54,54] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm1[12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,35,35,35,35,36,36,36,36,36,37,37,37,37,37,38,38,54,54,54,55,55,55,55,55,56,56,56,56,56,57,57,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: 
retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [51,51,51,51,52,52,52,52,52,53,53,53,53,53,54,54,54,54,54,55,55,55,55,55,56,56,56,56,56,57,57,57,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60,60,61,61,61,61,61,62,62,62,62,62,63,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,45,45,45,45,45,46,46,46,46,46,47,47,47,47,47,48,48,48,48,48,49,49,49,49,49,50,50,50,50,50,51] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31,32,32,32,32,32,33,33,33,33,33,34,34,34,34,34,35,35,35,35,35,36,36,36,36,36,37,37,37,37,37,38,38] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,22,22,22,23,23,23,23,23,24,24,24,24,24,25,25,25] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: 
vmovdqa32 1088(%rsi), %zmm17 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison) @@ -6021,25 +2983,20 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -6048,50 +3005,59 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf8: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf8: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; 
AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> @@ -6105,40 +3071,35 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), 
%zmm0 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -6147,80 +3108,98 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 
320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,56,56,56,56,56,56,57,57,57,57,57,57,58,58,58,58] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm4 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpermw %zmm4, %zmm5, %zmm4 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; 
AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm4, %zmm5, %zmm4 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> @@ -6232,986 +3211,227 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm12, %zmm12, %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) -; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: 
vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k5 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: kandw %k0, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; 
AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; 
AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k5, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k4 -; AVX512BW-NEXT: kshiftrd $31, %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $26, %k5, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; 
AVX512BW-NEXT: kshiftrd $27, %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, %k2 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $28, %k2, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, 
%k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k4, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $21, %k1, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k1, %k4 -; AVX512BW-NEXT: kmovq %k1, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw 
%k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $23, %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 -; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; 
AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: korw %k0, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovq %k4, %k0 -; AVX512BW-NEXT: kshiftrd $16, %k4, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $13, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 -; AVX512BW-NEXT: kmovq %k0, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k3 -; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $12, %k2, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $8, %k2, %k1 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $9, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k4 -; AVX512BW-NEXT: kmovq %k2, %k5 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $7, %k1, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; 
AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $4, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k7, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,24,24,24,24,24,24,25,25,25,25,25,25,26,26,26,26,42,42,43,43,43,43,43,43,44,44,44,44,44,44,45,45,61,61,61,61,62,62,62,62,62,62,63,63,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,29,29,29,29,30,30,30,30,30,30,31,31,31,31,31,31,32,32,32,32,32,32,33,33,33,33,33,33,34,34,34,34,50,50,51,51,51,51,51,51,52,52,52,52,52,52,53,53] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,56,56,56,56,56,56,57,57,57,57,57,57,58,58,58,58] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: 
kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,21,21,22,22,22,22,22,22,23,23,23,23,23,23,24,24,24,24,24,24,25,25,25,25,25,25,26,26,26,26,26,26,27,27,27,27,27,27,28,28,28,28,28,28,29,29,29,29,29,29,30,30,30,30,30,30,31,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15,16,16,16,16,16,16,17,17,17,17,17,17,18,18,18,18,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: 
kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> @@ -7223,75 +3443,70 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm6 -; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm4, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm18 -; 
AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm4, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm10, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm12, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm4, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm14, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm10, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm12, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm14, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm10, %zmm1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm12, %zmm3 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm9, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm10, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm12, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm14, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm9, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm10, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm11, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm12, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm14, %zmm8 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm10, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm12, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm14, %zmm5 +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm24, 
%zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm22 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm20 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 @@ -7302,116 +3517,111 @@ ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1152(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512F-ONLY-NEXT: 
vmovdqa64 %zmm16, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor6_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm6 -; AVX512DQ-NEXT: vpermd %zmm7, %zmm4, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm4, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm10, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm12, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm4, %zmm24 -; AVX512DQ-NEXT: 
vpermd %zmm9, %zmm14, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm14, %zmm4 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm12, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vpermd %zmm5, %zmm14, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm10, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm12, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm14, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm9, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm10, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm11, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm12, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm8, %zmm14, %zmm8 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm5, %zmm14, %zmm5 +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm23 {%k1} {z} ; 
AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm22 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm20 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 ; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 @@ -7422,1643 +3632,229 @@ ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 ; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm4 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1152(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 
320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k5 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kmovw (%rdi), %k0 -; AVX512BW-NEXT: kandw %k1, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: 
kandw %k1, %k0, %k3 -; AVX512BW-NEXT: kshiftrq $2, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k7 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k7, %k7 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k0, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k5, %k3 -; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k3, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 
-; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $5, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} -; AVX512BW-NEXT: kandw %k4, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $7, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: 
kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $9, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $10, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $11, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, 
%k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $12, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $13, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $14, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $15, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 
-; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $19, %k4, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, 
%k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kmovq %k3, %k5 -; AVX512BW-NEXT: kshiftrq $22, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; 
AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $28, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; 
AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $29, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $30, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; 
AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $32, %k2, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k2, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k5 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} 
{z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; 
AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $39, %k7, %k6 -; AVX512BW-NEXT: kmovq %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $40, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $43, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw 
%k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kandw %k5, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $47, %k5, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: 
korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k5 -; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $54, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw 
%k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, 
%k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, 
%k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) -; 
AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10,10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512BW-ONLY-NEXT: vpshufb %zmm4, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb %zmm4, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k6, %k7 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k6, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k6, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k5} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k3} {z} +; 
AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [53,53,53,53,54,54,54,54,54,54,55,55,55,55,55,55,56,56,56,56,56,56,57,57,57,57,57,57,58,58,58,58,58,58,59,59,59,59,59,59,60,60,60,60,60,60,61,61,61,61,61,61,62,62,62,62,62,62,63,63,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [42,42,43,43,43,43,43,43,44,44,44,44,44,44,45,45,45,45,45,45,46,46,46,46,46,46,47,47,47,47,47,47,48,48,48,48,48,48,49,49,49,49,49,49,50,50,50,50,50,50,51,51,51,51,51,51,52,52,52,52,52,52,53,53] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,32,32,33,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,36,36,36,36,36,36,37,37,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,40,40,40,40,40,40,41,41,41,41,41,41,42,42,42,42] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: 
vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,21,21,22,22,22,22,22,22,23,23,23,23,23,23,24,24,24,24,24,24,25,25,25,25,25,25,26,26,26,26,26,26,27,27,27,27,27,27,28,28,28,28,28,28,29,29,29,29,29,29,30,30,30,30,30,30,31,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15,16,16,16,16,16,16,17,17,17,17,17,17,18,18,18,18,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k6, %k7 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k6, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k6, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 
%zmm20, 1280(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison) @@ -9115,9 +3911,9 @@ ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, 48(%rdx) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, 48(%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -9202,69 +3998,59 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 -; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-SLOW-NEXT: kmovw (%rdi), %k2 +; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-SLOW-NEXT: movw $1, %ax -; AVX512F-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z} ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[2,3,3,3] -; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z} -; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} -; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx) +; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k2 +; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: mask_replication_factor7_vf8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 -; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-FAST-NEXT: kmovw (%rdi), %k2 +; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-FAST-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-FAST-NEXT: movw $1, %ax -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z} ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} -; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k2 +; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} +; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} +; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; 
AVX512F-FAST-NEXT: retq ; @@ -9272,32 +4058,27 @@ ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-SLOW-NEXT: movw $1, %ax -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3] ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k4} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -9305,32 +4086,27 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-FAST-NEXT: movw $1, %ax -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4 -; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z} -; 
AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} -; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k4} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -9348,11 +4124,11 @@ ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper @@ -9372,11 +4148,11 @@ ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper @@ -9395,45 +4171,40 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -9442,88 +4213,72 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: 
vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: mask_replication_factor7_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; 
AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57] +; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 -; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -9537,1133 +4292,263 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; 
AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm14, %zmm14, %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 
384(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 
%zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor7_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw (%rdi), %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k3 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovd (%rdi), %k6 -; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kmovw %k2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovq %k6, %k2 -; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k6 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, 
%k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k3 -; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $27, %k4, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k7 -; AVX512BW-NEXT: kshiftrd $28, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: 
korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k7, %k0, %k2 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $25, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $23, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k2, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k2, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 
640(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $18, %k2, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $19, %k2, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 
-; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k3 -; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 -; AVX512BW-NEXT: kmovq %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k5 -; AVX512BW-NEXT: kshiftrd $15, %k6, %k3 -; AVX512BW-NEXT: kmovq %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovq %k0, %k3 -; AVX512BW-NEXT: kshiftrd $11, %k0, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k0 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $10, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $6, %k4, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: 
korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $4, %k6, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k2 -; AVX512BW-NEXT: korw %k2, %k1, 
%k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31] +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; 
AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,38,39,39,39,39,39,39,39,40,40,40,40,40,40,40,41,57,57,57,57,57,57,58,58,58,58,58,58,58,59,59,59] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,48,48,48,48,48,48,48,49,49,49,49,49,49,49,50,50] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: 
vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,22,23,23,23,23,23,23,23,24,24,24,24,24,24,24,25,25,25,25,25,25,25,26,26,26,26,26,26,26,27,27,27] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,16,16,16,16,16,16,16,17,17,17,17,17,17,17,18,18] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> @@ -10675,76 +4560,72 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: ; AVX512F-ONLY: # %bb.0: -; 
AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm8, %zmm8, %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm11, %zmm11, %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm16, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm9, %zmm9, %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm0 +; AVX512F-ONLY-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm13, %zmm1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm15, %zmm2 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm17, %zmm4 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm17, %zmm3 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm18, %zmm6 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm4 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm19, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm8, %zmm5, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm16, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm5, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm13, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm15, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm16, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm17, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm18, %zmm28 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm5, %zmm29 -; AVX512F-ONLY-NEXT: vpermd %zmm11, %zmm19, %zmm30 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm31 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm17, %zmm11 -; 
AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm18, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm19, %zmm5 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm19, %zmm5 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm11, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm13, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm15, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm17, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm18, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm19, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm7, %zmm20, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm11, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm13, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm15, %zmm26 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm17, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm18, %zmm28 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm19, %zmm29 +; AVX512F-ONLY-NEXT: vpermd %zmm9, %zmm20, %zmm30 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm11, %zmm31 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm13, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm15, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm17, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm18, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm19, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm6, %zmm20, %zmm6 +; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm31, %zmm31, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm30, %zmm30, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm29, %zmm29, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm28, %zmm28, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm20 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 @@ -10757,8 +4638,8 @@ ; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm16, 
%zmm16, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 @@ -10767,121 +4648,118 @@ ; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm5 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm4 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 1408(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1088(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 896(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 832(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 768(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 640(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 576(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 320(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm8 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm11 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm13, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm15, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm9 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm13, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm15, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm5, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm16, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm5, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm13, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm15, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm16, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm17, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm18, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm5, %zmm29 -; AVX512DQ-NEXT: vpermd %zmm11, %zmm19, %zmm30 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm17, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm5 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpermd %zmm6, %zmm19, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm20 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm13, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm15, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm17, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm18, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm19, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm7, %zmm20, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm11, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm13, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm15, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm29 +; AVX512DQ-NEXT: vpermd %zmm9, %zmm20, %zmm30 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm11, %zmm31 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm13, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm15, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm19, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm6 +; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 ; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 ; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm15 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm31, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm16 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm30, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm29, %k1 ; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm28, %k1 ; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm20 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 ; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm26 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 @@ -10894,8 +4772,8 @@ ; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 ; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 ; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 @@ -10904,1879 +4782,263 @@ ; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 ; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm5 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm4 
{%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm4 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 1408(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1088(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 896(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 832(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 768(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 640(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 576(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor7_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kmovw (%rdi), %k0 -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k3 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; 
AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq (%rdi), %k4 -; AVX512BW-NEXT: kshiftrq $1, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $4, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kandw %k2, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw 
%k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $7, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $8, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $11, %k5, %k6 -; AVX512BW-NEXT: kmovq %k5, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $13, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $14, %k5, %k0 -; 
AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $19, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; 
AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $20, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kandw %k5, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $21, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $22, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; 
AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $23, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $24, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $25, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kandw %k5, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: 
kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $26, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $27, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $28, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, 
%k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $29, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $30, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 -; AVX512BW-NEXT: 
kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $37, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftrq $39, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload 
-; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $41, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $42, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; 
AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $43, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $44, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 
2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k0 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $51, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; 
AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $54, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; 
AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $56, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; 
AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $57, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $58, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $60, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $62, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,25,25,25,25,25,25,26,26,26,26,26,26,26,27,27,27,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,61,61,62,62,62,62,62,62,62,63,63,63,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb 
{{.*#+}} zmm1 = zmm1[13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,16,16,16,16,16,16,16,17,17,17,17,17,17,17,18,18,34,34,34,34,34,35,35,35,35,35,35,35,36,36,36,36,52,52,52,53,53,53,53,53,53,53,54,54,54,54,54,54] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,22,23,23,23,23,23,23,23,24,24,24,24,24,24,24,25,41,41,41,41,41,41,42,42,42,42,42,42,42,43,43,43,59,59,59,59,60,60,60,60,60,60,60,61,61,61,61,61] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31,32,32,32,32,32,32,32,33,33,33,33,33,33,33,34,34,50,50,50,50,50,51,51,51,51,51,51,51,52,52,52,52] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,38,39,39,39,39,39,39,39,40,40,40,40,40,40,40,41,57,57,57,57,57,57,58,58,58,58,58,58,58,59,59,59] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,48,48,48,48,48,48,48,49,49,49,49,49,49,49,50,50] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k6 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k7 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k7, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k7, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k6, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k6, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k5, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 
1024(%rsi), %zmm16 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512BW-ONLY-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512BW-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm27, 1728(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm26, 1664(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm25, 1600(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm24, 1536(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57,57,57,57,57,57,57,58,58,58,58,58,58,58,59,59,59,59,59,59,59,60,60,60,60,60,60,60,61,61,61,61,61,61,61,62,62,62,62,62,62,62,63,63,63,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,48,48,48,48,48,48,48,49,49,49,49,49,49,49,50,50,50,50,50,50,50,51,51,51,51,51,51,51,52,52,52,52,52,52,52,53,53,53,53,53,53,53,54,54,54,54,54,54] +; AVX512VBMI-ONLY-NEXT: 
vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,38,39,39,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,41,41,42,42,42,42,42,42,42,43,43,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31,32,32,32,32,32,32,32,33,33,33,33,33,33,33,34,34,34,34,34,34,34,35,35,35,35,35,35,35,36,36,36,36] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,22,23,23,23,23,23,23,23,24,24,24,24,24,24,24,25,25,25,25,25,25,25,26,26,26,26,26,26,26,27,27,27] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,16,16,16,16,16,16,16,17,17,17,17,17,17,17,18,18] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k7 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k7, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k7, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k6, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k6, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k5, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1152(%rsi), 
%zmm18 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm27, 1728(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm26, 1664(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm25, 1600(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm24, 1536(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> %data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison) @@ -12890,95 +5152,95 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor8_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8: ; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; 
AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -12994,48 +5256,48 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; 
AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; 
AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -13043,48 +5305,48 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -13097,28 +5359,28 @@ ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -13308,50 +5570,50 @@ ; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} ; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} ; AVX512BW-NEXT: kshiftrq $32, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 
{%k4} {z} +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512BW-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} ; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} ; AVX512BW-NEXT: kshiftrq $32, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512BW-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 diff --git 
a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -1107,9 +1107,9 @@ ; SSE41-LABEL: constant_rotate_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -1118,13 +1118,13 @@ ; ; AVX1-LABEL: constant_rotate_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -901,21 +901,21 @@ ; AVX1-LABEL: constant_rotate_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -1827,10 +1827,10 @@ ; SSE41-NEXT: shrb $2, %cl ; SSE41-NEXT: andb $1, %cl ; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; SSE41-NEXT: shrb $3, %al ; SSE41-NEXT: movzbl %al, %eax -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; SSE41-NEXT: pinsrb $12, %eax, %xmm1 ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -1842,28 +1842,28 @@ ; AVX1-LABEL: load_sext_4i1_to_4i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrb $3, %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: negq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 ; AVX1-NEXT: movzbl %al, %ecx -; AVX1-NEXT: shrb %al +; AVX1-NEXT: shrb $2, %al ; AVX1-NEXT: movzbl %al, %eax ; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: negl %eax -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: negl %edx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: negq %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: movzbl %al, %eax ; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: negl %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: shrb $3, %cl +; AVX1-NEXT: negq %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: shrb %cl ; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: negl %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: negq %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1957,10 +1957,10 @@ ; X86-SSE41-NEXT: shrb $2, %cl ; X86-SSE41-NEXT: andb $1, %cl ; X86-SSE41-NEXT: movzbl %cl, %ecx +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; X86-SSE41-NEXT: pinsrb $8, %ecx, %xmm1 ; X86-SSE41-NEXT: shrb $3, %al ; X86-SSE41-NEXT: movzbl %al, %eax -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] ; X86-SSE41-NEXT: pinsrb $12, %eax, %xmm1 ; X86-SSE41-NEXT: pslld $31, %xmm0 ; X86-SSE41-NEXT: psrad $31, %xmm0 @@ -3490,13 +3490,10 @@ ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movl 8(%rdi), %ecx -; SSE2-NEXT: shll $28, %ecx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $51, %rdx -; SSE2-NEXT: shll $15, %edx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: sarl $15, %edx -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shldq $13, %rax, %rcx +; SSE2-NEXT: shll $15, %ecx +; SSE2-NEXT: sarl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shrq $34, %rax ; SSE2-NEXT: shll $15, %eax ; SSE2-NEXT: sarl $15, %eax @@ -3519,13 +3516,10 @@ ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movl 8(%rdi), %ecx -; SSSE3-NEXT: shll $28, %ecx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: shrq $51, %rdx -; 
SSSE3-NEXT: shll $15, %edx -; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: sarl $15, %edx -; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: shldq $13, %rax, %rcx +; SSSE3-NEXT: shll $15, %ecx +; SSSE3-NEXT: sarl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: shrq $34, %rax ; SSSE3-NEXT: shll $15, %eax ; SSSE3-NEXT: sarl $15, %eax @@ -3537,53 +3531,47 @@ ; SSE41-LABEL: sext_4i17_to_4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq $17, %rcx +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: movl 8(%rdi), %esi +; SSE41-NEXT: shldq $13, %rax, %rsi +; SSE41-NEXT: shrq $17, %rax +; SSE41-NEXT: shll $15, %eax +; SSE41-NEXT: sarl $15, %eax ; SSE41-NEXT: shll $15, %ecx ; SSE41-NEXT: sarl $15, %ecx -; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: shrq $34, %rdx ; SSE41-NEXT: shll $15, %edx ; SSE41-NEXT: sarl $15, %edx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq $34, %rcx -; SSE41-NEXT: shll $15, %ecx -; SSE41-NEXT: sarl $15, %ecx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: movl 8(%rdi), %ecx -; SSE41-NEXT: shll $28, %ecx -; SSE41-NEXT: shrq $51, %rax -; SSE41-NEXT: shll $15, %eax -; SSE41-NEXT: orl %ecx, %eax -; SSE41-NEXT: sarl $15, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; SSE41-NEXT: pinsrd $2, %edx, %xmm0 +; SSE41-NEXT: shll $15, %esi +; SSE41-NEXT: sarl $15, %esi +; SSE41-NEXT: pinsrd $3, %esi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sext_4i17_to_4i32: ; AVX: # %bb.0: ; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $17, %rcx +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: movq %rax, %rdx +; AVX-NEXT: movl 8(%rdi), %esi +; AVX-NEXT: shldq $13, %rax, %rsi +; AVX-NEXT: shrq $17, %rax +; AVX-NEXT: shll $15, %eax +; AVX-NEXT: sarl $15, %eax ; AVX-NEXT: shll $15, %ecx ; AVX-NEXT: sarl $15, %ecx -; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: shrq $34, %rdx ; AVX-NEXT: shll $15, %edx ; AVX-NEXT: sarl $15, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $34, %rcx -; AVX-NEXT: shll $15, %ecx -; AVX-NEXT: sarl $15, %ecx -; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl 8(%rdi), %ecx -; AVX-NEXT: shll $28, %ecx -; AVX-NEXT: shrq $51, %rax -; AVX-NEXT: shll $15, %eax -; AVX-NEXT: orl %ecx, %eax -; AVX-NEXT: sarl $15, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX-NEXT: shll $15, %esi +; AVX-NEXT: sarl $15, %esi +; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-SSE2-LABEL: sext_4i17_to_4i32: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1180,7 +1180,6 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -1190,7 +1189,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: 
vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1199,9 +1197,8 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -1209,9 +1206,8 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1306,7 +1306,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1328,8 +1327,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -961,7 +961,6 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -971,7 +970,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -980,9 +978,8 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -990,9 +987,8 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1074,7 +1074,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1093,8 +1092,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -865,7 +865,6 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -875,7 +874,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -884,9 +882,8 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -894,9 +891,8 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -995,7 +995,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1013,8 +1012,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -113,77 +113,51 @@ } define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { -; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; SSE: # %bb.0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: retq -; -; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: 
vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX2-SLOW-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] -; AVX2-FAST-NEXT: retq +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSSE3-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] -; AVX512VL-NEXT: retq +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSE41-NEXT: retq ; -; XOP-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; XOP: # %bb.0: -; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; XOP-NEXT: retq +; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { -; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; SSE: # %bb.0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: retq -; -; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: retq +; SSSE3-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; SSSE3-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX512VL-NEXT: retq +; SSE41-LABEL: 
shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; SSE41-NEXT: retq ; -; XOP-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; XOP: # %bb.0: -; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; XOP-NEXT: retq +; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -969,13 +969,27 @@ ; ; SSE41-LABEL: insert_reg_lo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $0, %rdi, %xmm0 +; SSE41-NEXT: movq %rdi, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_reg_lo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_lo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_lo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_reg_lo_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rdi, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -999,12 +1013,14 @@ ; ; SSE41-LABEL: insert_mem_lo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0 +; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: insert_mem_lo_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: retq %a = load i64, ptr %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 @@ -1013,32 +1029,16 @@ } define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) { -; SSE2-LABEL: insert_reg_hi_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %rdi, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_reg_hi_v2i64: -; SSE3: # %bb.0: -; SSE3-NEXT: movq %rdi, %xmm1 -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_reg_hi_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq %rdi, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_reg_hi_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $1, %rdi, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: insert_reg_hi_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX-LABEL: insert_reg_hi_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 +; AVX-NEXT: vmovq %rdi, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %v = insertelement <2 
x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> @@ -1046,32 +1046,16 @@ } define <2 x i64> @insert_mem_hi_v2i64(ptr %ptr, <2 x i64> %b) { -; SSE2-LABEL: insert_mem_hi_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_mem_hi_v2i64: -; SSE3: # %bb.0: -; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_mem_hi_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_mem_hi_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: insert_mem_hi_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_hi_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %a = load i64, ptr %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1517,13 +1517,13 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_2456: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_2456: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE3-NEXT: retq ; @@ -2052,19 +2052,19 @@ define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) { ; SSE2-LABEL: extract3_insert3_v4i32_0127: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSE3-LABEL: extract3_insert3_v4i32_0127: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: extract3_insert3_v4i32_0127: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1330,7 +1330,7 @@ ; SSE2-LABEL: shuffle_v8i16_032dXXXX: ; SSE2: # %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,3,2,1,4,5,6,7] @@ -3255,10 +3255,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i32: ; XOPAVX1: # %bb.0: @@ -3269,7 +3277,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -3279,24 +3289,12 @@ } define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) { -; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSE41-NEXT: retq +; SSE-LABEL: insert_dup_elt3_mem_v8i16_i32: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32: ; AVX1: # %bb.0: @@ -3305,10 +3303,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i32: ; XOPAVX1: # %bb.0: @@ -3319,7 +3325,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 @@ -3384,33 +3392,20 @@ } define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(ptr %ptr) { -; SSE2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movswl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movswl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movswl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSE41-NEXT: retq +; SSE-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: +; SSE: # %bb.0: +; SSE-NEXT: movswl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: ; AVX1: # %bb.0: ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: @@ -3432,7 +3427,8 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: movswl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: @@ -3453,14 +3449,14 @@ define <8 x i16> @insert_dup_mem_v8i16_i64(ptr %ptr) { ; SSE-LABEL: insert_dup_mem_v8i16_i64: ; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq @@ -3472,7 +3468,7 @@ ; ; XOPAVX1-LABEL: insert_dup_mem_v8i16_i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: retq @@ -3503,10 +3499,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i64: ; XOPAVX1: # %bb.0: @@ -3517,7 +3521,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = 
insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -3541,10 +3547,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64: ; XOPAVX1: # %bb.0: @@ -3555,7 +3569,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -3592,10 +3608,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64: ; XOPAVX1: # %bb.0: @@ -3606,7 +3630,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -54,24 +54,29 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -109,24 +114,29 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -164,24 +174,29 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: @@ -217,11 +232,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: 
@@ -255,11 +282,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -293,11 +332,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -331,11 +382,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # 
%bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1007,26 +1070,31 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] +; AVX512VL-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -1060,26 +1128,31 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; 
AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -1113,26 +1186,31 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: 
shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: @@ -1164,12 +1242,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1199,12 +1289,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 
= mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1234,12 +1336,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1269,12 +1383,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # 
%bb.0: @@ -1687,12 +1813,19 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: @@ -3428,8 +3561,8 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,4,5,8,9,14,15] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,2,3,4,5],xmm3[6,7] +; XOPAVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -3686,11 +3819,12 @@ ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: @@ -4353,8 +4487,8 @@ ; ; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-SLOW-NEXT: retq ; @@ -4367,8 +4501,8 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4388,8 +4522,8 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] -; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -4400,11 +4534,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: @@ -4459,11 +4594,12 @@ ; AVX1-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: @@ -4518,11 +4654,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, 
%ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: @@ -4577,11 +4714,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: @@ -4696,11 +4834,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: @@ -4755,11 +4894,12 @@ ; AVX1-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: @@ -4814,11 +4954,12 @@ ; AVX1-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 
= xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: @@ -4911,11 +5052,24 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,3,5,6,4,7,5] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,0,1,2,3,8,9,14,15,16,17,18,19,20,21,22,23,16,17,18,19,24,25,30,31] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # %bb.0: @@ -5099,8 +5253,8 @@ ; ; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-SLOW-NEXT: retq ; @@ -5113,8 +5267,8 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5134,8 +5288,8 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] -; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 
= ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6094,8 +6248,7 @@ ; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,1,4,5,8,9,4,5,0,1,4,5,8,9,4,5] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -6153,9 +6306,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq ; @@ -6177,9 +6330,9 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6335,9 +6488,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq ; @@ -6353,16 +6506,17 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,4,5,10,11] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,0,1,2,3],xmm3[4,5] +; XOPAVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6519,7 +6673,7 @@ ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq @@ -6535,15 +6689,16 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,4,5,10,11] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3],xmm3[4,5] +; XOPAVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; XOPAVX2-NEXT: retq @@ -6597,7 +6752,7 @@ ; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; 
AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq @@ -6622,7 +6777,7 @@ ; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; XOPAVX2-NEXT: retq @@ -7537,30 +7692,17 @@ ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: PR34369: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: PR34369: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: PR34369: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: PR34369: ; AVX512VL: # %bb.0: @@ -7635,21 +7777,29 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(ptr %ptr) { ; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: movswl (%rdi), %eax +; 
AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: movzwl (%rdi), %eax +; XOPAVX1-NEXT: movswl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] @@ -7658,7 +7808,9 @@ ; ; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; XOPAVX2-NEXT: movswl (%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i32 @@ -7677,10 +7829,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i32: ; XOPAVX1: # %bb.0: @@ -7692,7 +7852,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -7710,10 +7872,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i32: ; XOPAVX1: # %bb.0: @@ -7725,7 +7895,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 @@ -7776,10 +7948,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i64: +; AVX512VL: # %bb.0: +; 
AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i64: ; XOPAVX1: # %bb.0: @@ -7791,7 +7971,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -7809,10 +7991,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64: ; XOPAVX1: # %bb.0: @@ -7824,7 +8014,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -7836,26 +8028,38 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(ptr %ptr) { ; AVX1-LABEL: insert_dup_elt7_mem_v16i16_i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v16i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1 @@ -7867,22 +8071,30 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) { ; AVX1-LABEL: 
insert_dup_mem_v16i16_sext_i16_i64: ; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movswq (%rdi), %rax +; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movswq (%rdi), %rax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movswq (%rdi), %rax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: movzwl (%rdi), %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm0 +; XOPAVX1-NEXT: movswq (%rdi), %rax +; XOPAVX1-NEXT: vmovq %rax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7890,7 +8102,9 @@ ; ; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; XOPAVX2-NEXT: movswq (%rdi), %rax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i64 @@ -7960,11 +8174,19 @@ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[26,27],zero,zero ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: pr43230: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: pr43230: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: pr43230: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: retq ; ; XOPAVX1-LABEL: pr43230: ; XOPAVX1: # %bb.0: @@ -7985,7 +8207,9 @@ ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; XOPAVX2-NEXT: retq %shr = lshr <16 x i16> %a, %b %shuf = shufflevector <16 x i16> zeroinitializer, <16 x i16> %shr, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4433,11 +4433,24 @@ ; AVX1-NEXT: vinsertf128 
$1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: ; AVX512VL: # %bb.0: @@ -4770,18 +4783,11 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; AVX512VL: # %bb.0: @@ -5145,7 +5151,7 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(ptr %ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8: ; AVX1: # %bb.0: -; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movsbl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -5159,7 +5165,7 @@ ; ; XOPAVX1-LABEL: 
insert_dup_mem_v32i8_sext_i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: movzbl (%rdi), %eax +; XOPAVX1-NEXT: movsbl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2008,14 +2008,14 @@ ; ; AVX2-LABEL: add_v4f64_024u_135u_reverse: ; AVX2: # %bb.0: -; AVX2-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,1] +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: add_v4f64_024u_135u_reverse: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,1] +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] ; AVX512VL-NEXT: retq %shuffle0 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3686,37 +3686,43 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1-LABEL: broadcast_concat_crash: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: broadcast_concat_crash: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: broadcast_concat_crash: ; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: broadcast_concat_crash: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] -; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 -; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: broadcast_concat_crash: +; AVX512VL-FAST-ALL: # %bb.0: # %entry +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: vbroadcastss 
%xmm2, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [3,13,1,1,3,13,1,1] +; AVX512VL-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: broadcast_concat_crash: +; AVX512VL-FAST-PERLANE: # %bb.0: # %entry +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512VL-FAST-PERLANE-NEXT: retq entry: %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> %bc = bitcast <8 x float> %tmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -586,8 +586,8 @@ define <16 x float> @insert_sub01_8(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) { ; ALL-LABEL: insert_sub01_8: ; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; ALL-NEXT: vinsertf32x4 $1, %xmm2, %zmm1, %zmm1 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> @@ -600,9 +600,9 @@ define <16 x float> @insert_sub23_0(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) { ; ALL-LABEL: insert_sub23_0: ; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm3 killed $xmm3 def $ymm3 -; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm1 -; ALL-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vinsertf32x4 $2, %xmm3, %zmm0, %zmm1 +; ALL-NEXT: vinsertf32x4 $3, %xmm4, %zmm1, %zmm1 +; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7] ; ALL-NEXT: retq %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> %sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32> @@ -865,8 +865,8 @@ ; ALL-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0] ; ALL-NEXT: vmulps 32(%rdi), %ymm0, %ymm0 ; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 -; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,4,5,6,7] -; ALL-NEXT: vmovapd %ymm0, {{[0-9]+}}(%rsp) +; ALL-NEXT: vextractf32x4 $2, %zmm0, {{[0-9]+}}(%rsp) +; ALL-NEXT: vextractf32x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; ALL-NEXT: movq %rbp, %rsp ; ALL-NEXT: popq %rbp ; ALL-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -285,13 +285,16 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(ptr %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw (%rdi), %zmm0 +; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i32 @@ -304,13 
+307,16 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(ptr %ptr) #0 { ; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: movzwl 2(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -322,13 +328,16 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(ptr %ptr) #0 { ; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: movzwl 2(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 @@ -358,13 +367,16 @@ define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_elt1_mem_v16i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: movzwl 2(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt1_mem_v16i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -376,13 +388,16 @@ define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; KNL-NEXT: movzwl 6(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 6(%rdi), %zmm0 +; SKX-NEXT: movzwl 6(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -394,13 +409,16 @@ define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; KNL-NEXT: movzwl 6(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 6(%rdi), %zmm0 +; SKX-NEXT: movzwl 6(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1 @@ -412,13 +430,16 @@ define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: movswq (%rdi), %rax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; 
KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw (%rdi), %zmm0 +; SKX-NEXT: movswq (%rdi), %rax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -290,27 +290,22 @@ ;Negative test. define <8 x float> @expand15(<4 x float> %a) { -; AVX512-SLOW-LABEL: expand15: -; AVX512-SLOW: # %bb.0: -; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] -; AVX512-SLOW-NEXT: ret{{[l|q]}} -; -; AVX512-FAST-LABEL: expand15: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [1,0,0,0,1,0,0,0] -; AVX512-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] -; AVX512-FAST-NEXT: ret{{[l|q]}} +; AVX512-LABEL: expand15: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,8,3,9,5,6,7] +; AVX512-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1 +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: expand15: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,16,3,17,5,6,7] +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovaps %ymm1, %ymm0 ; AVX512F-NEXT: ret{{[l|q]}} %addV = fadd <4 x float> , %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> @@ -572,8 +567,11 @@ ; X86-AVX512-SLOW-NEXT: vpbroadcastd 44(%ecx), %xmm0 ; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%eax) -; X86-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X86-AVX512-SLOW-NEXT: vmovdqa 208(%ecx), %xmm0 +; X86-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X86-AVX512-SLOW-NEXT: vmovd %xmm0, %ecx +; X86-AVX512-SLOW-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0 ; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%eax) ; X86-AVX512-SLOW-NEXT: vzeroupper ; X86-AVX512-SLOW-NEXT: retl @@ -583,8 +581,11 @@ ; X64-AVX512-SLOW-NEXT: vpbroadcastd 44(%rdi), %xmm0 ; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%rsi) -; X64-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X64-AVX512-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; X64-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X64-AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX512-SLOW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm0 ; 
X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%rsi) ; X64-AVX512-SLOW-NEXT: vzeroupper ; X64-AVX512-SLOW-NEXT: retq @@ -597,7 +598,9 @@ ; X86-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%eax) ; X86-AVX512-FAST-NEXT: vmovdqa 208(%ecx), %xmm0 -; X86-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX512-FAST-NEXT: vmovd %xmm0, %ecx +; X86-AVX512-FAST-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0 ; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%eax) ; X86-AVX512-FAST-NEXT: vzeroupper ; X86-AVX512-FAST-NEXT: retl @@ -608,7 +611,9 @@ ; X64-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%rsi) ; X64-AVX512-FAST-NEXT: vmovdqa 208(%rdi), %xmm0 -; X64-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX512-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX512-FAST-NEXT: vpinsrd $1, %eax, %xmm1, %xmm0 ; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%rsi) ; X64-AVX512-FAST-NEXT: vzeroupper ; X64-AVX512-FAST-NEXT: retq @@ -620,8 +625,11 @@ ; X86-AVX512F-NEXT: vpbroadcastd 44(%ecx), %xmm0 ; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X86-AVX512F-NEXT: vmovdqa %ymm0, 672(%eax) -; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X86-AVX512F-NEXT: vmovdqa 208(%ecx), %xmm0 +; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X86-AVX512F-NEXT: vmovd %xmm0, %ecx +; X86-AVX512F-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0 ; X86-AVX512F-NEXT: vmovdqa %ymm0, 832(%eax) ; X86-AVX512F-NEXT: vzeroupper ; X86-AVX512F-NEXT: retl @@ -631,8 +639,11 @@ ; X64-AVX512F-NEXT: vpbroadcastd 44(%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX512F-NEXT: vmovdqa %ymm0, 672(%rsi) -; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X64-AVX512F-NEXT: vmovdqa 208(%rdi), %xmm0 +; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X64-AVX512F-NEXT: vmovd %xmm0, %eax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm0 ; X64-AVX512F-NEXT: vmovdqa %ymm0, 832(%rsi) ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -132,19 +132,10 @@ } define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { -; AVX-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX-NEXT: ret{{[l|q]}} -; -; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512-NEXT: ret{{[l|q]}} +; CHECK-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> ) @@ -439,7 +430,7 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2] +; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 ; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3] @@ -491,16 +482,16 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [1,0,2,0,8,0,9,0] -; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,10,0,2,0,9,0] -; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6 -; X86-AVX512-NEXT: vmovapd %ymm6, (%edx) -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] -; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 -; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) +; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [1,0,2,0,8,0,9,0] +; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 +; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2,3] +; X86-AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] +; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3] +; X86-AVX512-NEXT: vmovapd %ymm3, (%edx) +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0] +; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 +; X86-AVX512-NEXT: vmovapd %ymm3, (%ecx) ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0] ; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 @@ -513,7 +504,7 @@ ; X64-AVX1-LABEL: PR48908: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2] +; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 ; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3] @@ -562,10 +553,10 @@ ; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,8,9] ; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,10,2,9] -; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6 -; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi) +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2,3] +; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3] +; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi) ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1] ; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 ; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi) @@ -756,3 +747,5 @@ %v1 = shufflevector <16 x i64> %v0, <16 x 
i64> undef, <16 x i32> ret <16 x i64> %v1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -55,6 +55,9 @@ ; CHECK-LABEL: combine_and_pshufb: ; CHECK: # %bb.0: ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> @@ -128,7 +131,7 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) { ; CHECK-LABEL: combine_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,0,4,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> @@ -837,16 +840,29 @@ ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} ; -; AVX512-LABEL: PR34577: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2> -; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: ret{{[l|q]}} +; X86-AVX512-LABEL: PR34577: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <8,0,u,u,1,0,u,u> +; X86-AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; X86-AVX512-NEXT: vpermt2pd %zmm3, %zmm2, %zmm0 +; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,23,18,4,5,19,18] +; X86-AVX512-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: PR34577: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <8,u,1,u> +; X64-AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm2, %zmm0 +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,23,18,4,5,19,18] +; X64-AVX512-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; X64-AVX512-NEXT: retq entry: %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> %sel = select <8 x i1> , <8 x float> %shuf0, <8 x float> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -54,10 +54,10 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X86-LABEL: combine_pshufb_identity_mask: 
; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} @@ -102,8 +102,8 @@ define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) { ; X86-LABEL: combine_pshufb_as_pslldq_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} ; X86-NEXT: retl ; @@ -128,8 +128,8 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) { ; X86-LABEL: combine_pshufb_as_psrldq_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} ; X86-NEXT: retl ; @@ -158,10 +158,10 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64> %a1, i64 %m) { ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,12,0,5,0,14,0,7,0,12,0,5,0,14,0] ; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X86-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -110,8 +110,8 @@ ; X86-NEXT: vpmovqw %ymm1, %xmm1 ; X86-NEXT: vpsllw $8, %xmm0, %xmm0 ; X86-NEXT: vpsraw $8, %xmm0, %xmm0 -; X86-NEXT: vpsllw $8, %xmm1, %xmm1 -; X86-NEXT: vpsraw $8, %xmm1, %xmm1 +; X86-NEXT: vpsllw $8, %ymm1, %ymm1 +; X86-NEXT: vpsraw $8, %ymm1, %ymm1 ; X86-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X86-NEXT: vmovdqu %ymm0, (%eax) ; X86-NEXT: vzeroupper @@ -123,11 +123,14 @@ ; X64-NEXT: vmovdqu (%rax), %ymm1 ; X64-NEXT: vpmovqw %ymm0, %xmm0 ; X64-NEXT: vpmovqw %ymm1, %xmm1 -; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: vpsllw $8, %ymm0, %ymm0 -; X64-NEXT: vpsraw $8, %ymm0, %ymm0 -; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1] -; X64-NEXT: vmovdqu %ymm0, (%rdi) +; X64-NEXT: vpsllw $8, %xmm0, 
%xmm0 +; X64-NEXT: vpsraw $8, %xmm0, %xmm0 +; X64-NEXT: vpsllw $8, %xmm1, %xmm1 +; X64-NEXT: vpsraw $8, %xmm1, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vmovdqu %xmm0, (%rdi) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqu %xmm0, 16(%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %2 = load <4 x i64>, ptr null, align 8 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -128,8 +128,8 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64> %a1, i64 %m) { ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = [56,57,58,59,56,57,58,59,56,57,58,59,56,57,58,59,44,45,46,47,44,45,46,47,44,45,46,47,44,45,46,47,96,97,98,99,96,97,98,99,96,97,98,99,96,97,98,99,116,117,118,119,116,117,118,119,116,117,118,119,116,117,118,119] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 {%k1} {z} ; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -25,35 +25,37 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-LABEL: PR50049: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = <128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u> -; SSE-NEXT: pshufb %xmm6, %xmm0 +; SSE-NEXT: pshufb %xmm6, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = <0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u> -; SSE-NEXT: pshufb %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufb %xmm7, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; SSE-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pshufb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [128,128,128,128,128,128,128,128,128,128,128,1,4,7,10,13] +; SSE-NEXT: pshufb %xmm9, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufb %xmm6, %xmm5 ; SSE-NEXT: pshufb %xmm7, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE-NEXT: pmullw %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u> -; SSE-NEXT: pshufb %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u> -; SSE-NEXT: pshufb %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufb %xmm6, %xmm2 -; SSE-NEXT: pshufb %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 
-; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE-NEXT: pshufb %xmm3, %xmm4 +; SSE-NEXT: pshufb %xmm9, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pmullw %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pmullw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq %x1 = load <48 x i8>, ptr %p1, align 16 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -469,9 +469,15 @@ } define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) { -; CHECK-LABEL: combine_pshufb_as_unpacklo_undef: -; CHECK: # %bb.0: -; CHECK-NEXT: retq +; SSE-LABEL: combine_pshufb_as_unpacklo_undef: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufb_as_unpacklo_undef: +; AVX: # %bb.0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1724,46 +1724,38 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE2-LABEL: combine_test1c: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: andps %xmm0, %xmm2 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test1c: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test1c: ; SSE41: # 
%bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rsi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test1c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test1c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test1c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1772,18 +1764,34 @@ } define <4 x i8> @combine_test2c(ptr %a, ptr %b) { -; SSE-LABEL: combine_test2c: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: retq +; SSE2-LABEL: combine_test2c: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2c: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2c: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rsi), %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2c: ; AVX: # %bb.0: ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b @@ -1793,20 +1801,34 @@ } define <4 x i8> @combine_test3c(ptr %a, ptr %b) { -; SSE-LABEL: combine_test3c: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,1,1] -; SSE-NEXT: retq +; SSE2-LABEL: combine_test3c: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test3c: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test3c: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rdi), %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test3c: ; AVX: # %bb.0: ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpinsrd $1, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b @@ -1818,46 +1840,38 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; SSE2-LABEL: combine_test4c: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: andps %xmm0, %xmm2 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test4c: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,2,3,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test4c: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rdi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,2,3,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: retq ; -; AVX1-LABEL: 
combine_test4c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test4c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test4c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrd $1, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,2,3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -2479,23 +2493,23 @@ ; AVX2-SLOW-LABEL: combine_unneeded_subvector1: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> @@ -2649,14 +2663,16 @@ define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) { ; SSE-LABEL: combine_scalar_load_with_blend_with_zero: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: movdqa %xmm0, (%rsi) ; SSE-NEXT: retq ; ; AVX-LABEL: combine_scalar_load_with_blend_with_zero: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq %1 = load double, ptr %a0, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 @@ -2714,15 +2730,21 @@ ; ; SSE41-LABEL: combine_constant_insertion_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = -; SSE41-NEXT: pinsrd $0, %edi, %xmm0 +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_constant_insertion_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; 
AVX1-LABEL: combine_constant_insertion_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_constant_insertion_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX2-NEXT: retq %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 %ret = shufflevector <4 x i32> %a0, <4 x i32> , <4 x i32> ret <4 x i32> %ret @@ -3008,40 +3030,56 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { ; SSE2-LABEL: shuffle_extract_concat_insert: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %edx +; SSE2-NEXT: pextrw $7, %xmm2, %esi +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: pinsrw $5, %edx, %xmm0 +; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: pinsrw $7, %esi, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_extract_concat_insert: ; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pextrw $7, %xmm2, %esi ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pinsrw $4, %eax, %xmm0 +; SSSE3-NEXT: pinsrw $5, %ecx, %xmm0 +; SSSE3-NEXT: pinsrw $6, %edx, %xmm0 +; SSSE3-NEXT: pinsrw $7, %esi, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_extract_concat_insert: ; SSE41: # %bb.0: -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $2, %xmm1, %eax ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15] +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: pinsrw $4, %eax, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_extract_concat_insert: ; AVX: # %bb.0: -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpextrw $2, %xmm1, %eax +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15] +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; AVX-NEXT: retq %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> %a0 = extractelement <8 x i16> %a, i32 0 @@ -3067,17 +3105,19 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsbl (%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movl $65531, %ecx # imm = 0xFFFB +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movsbl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movsbl (%rdx), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_scalar_to_vector_extract: @@ -3093,7 +3133,9 @@ ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSSE3-NEXT: movl $65531, %eax # imm = 0xFFFB +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; @@ -3149,23 +3191,58 @@ ; Bug noticed in D96345 define i32 @shuffle_binops_with_undef() { -; SSE-LABEL: shuffle_binops_with_undef: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movdqa (%rax), %xmm0 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: psrlw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_binops_with_undef: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rax), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: psrlw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rax) +; SSE2-NEXT: 
retq ; -; AVX-LABEL: shuffle_binops_with_undef: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa (%rax), %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rax) -; AVX-NEXT: retq +; SSSE3-LABEL: shuffle_binops_with_undef: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rax), %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: psrlw %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_binops_with_undef: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa (%rax), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: psrlw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_binops_with_undef: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa (%rax), %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_binops_with_undef: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-NEXT: retq entry: %load0 = load <8 x i16>, ptr undef, align 16 %load1 = load <8 x i16>, ptr undef, align 16 @@ -3185,48 +3262,18 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) define void @PR43024() { -; SSE2-LABEL: PR43024: -; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE2-NEXT: movaps %xmm0, (%rax) -; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm0 -; SSE2-NEXT: addss %xmm1, %xmm0 -; SSE2-NEXT: movss %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR43024: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSSE3-NEXT: movaps %xmm0, (%rax) -; SSSE3-NEXT: addss %xmm0, %xmm0 -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: movss %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR43024: -; SSE41: # %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE41-NEXT: movaps %xmm0, (%rax) -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm0 -; SSE41-NEXT: addss %xmm1, %xmm0 -; SSE41-NEXT: movss %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: PR43024: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 +; SSE-NEXT: retq ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss 
{{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovss %xmm0, (%rax) +; AVX-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 ; AVX-NEXT: retq store <4 x float> , ptr undef, align 16 %1 = load <4 x float>, ptr undef, align 16 @@ -3481,13 +3528,15 @@ ; SSE2-LABEL: SpinningCube: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movapd {{.*#+}} xmm2 = -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps {{.*#+}} xmm3 = <0.0E+0,-2.0E+0,u,u> +; SSE2-NEXT: mulps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[2,0] ; SSE2-NEXT: addps %xmm3, %xmm1 ; SSE2-NEXT: movaps %xmm1, (%rax) ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -3500,18 +3549,22 @@ ; SSSE3-LABEL: SpinningCube: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movapd {{.*#+}} xmm2 = -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSSE3-NEXT: xorps %xmm3, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSSE3-NEXT: addps %xmm3, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSSE3-NEXT: movaps {{.*#+}} xmm2 = <0.0E+0,0.0E+0,-2.0E+0,u> +; SSSE3-NEXT: mulps %xmm2, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, %xmm4 +; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSSE3-NEXT: addps %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm3, (%rax) ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] -; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: mulps %xmm2, %xmm1 ; SSSE3-NEXT: addps %xmm0, %xmm1 ; SSSE3-NEXT: movaps %xmm1, (%rax) ; SSSE3-NEXT: retq @@ -3519,31 +3572,35 @@ ; SSE41-LABEL: SpinningCube: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = -; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3] -; SSE41-NEXT: addps %xmm3, %xmm4 +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSE41-NEXT: movaps {{.*#+}} xmm2 = <0.0E+0,0.0E+0,-2.0E+0,u> +; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: movss 
{{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[2,3] +; SSE41-NEXT: addps %xmm0, %xmm4 ; SSE41-NEXT: movaps %xmm4, (%rax) -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2] -; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm0, %xmm2 -; SSE41-NEXT: movaps %xmm2, (%rax) +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, (%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: SpinningCube: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vbroadcastss (%rax), %xmm0 +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,-2.0E+0] +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX-NEXT: vaddps %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovaps %xmm2, (%rax) ; AVX-NEXT: vbroadcastss (%rax), %xmm2 ; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll @@ -21,8 +21,8 @@ ; ; AVX-LABEL: concat_a_to_shuf_of_a: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovaps %ymm0, (%rsi) ; AVX-NEXT: vzeroupper @@ -68,8 +68,8 @@ ; ; AVX-LABEL: concat_shuf_of_a_to_a: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper @@ -609,8 +609,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, 32(%rsi) ; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) ; SSE-NEXT: movdqa %xmm0, 16(%rsi) ; SSE-NEXT: movdqa %xmm1, (%rsi) ; SSE-NEXT: retq @@ -637,9 +637,8 @@ ; AVX512F-LABEL: concat_aaa_to_shuf_of_a: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,1,0,1,2,3] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -647,9 +646,8 @@ ; AVX512BW-LABEL: concat_aaa_to_shuf_of_a: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,1,0,1,2,3] +; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -694,9 +692,8 @@ ; AVX512F-LABEL: concat_shuf_of_a_to_aaa: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,0,1,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -704,9 +701,8 @@ ; AVX512BW-LABEL: concat_shuf_of_a_to_aaa: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,0,1,1,0] +; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -364,8 +364,8 @@ ; AMD10H: # %bb.0: ; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AMD10H-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: packuswb %xmm0, %xmm0 ; AMD10H-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -209,11 +209,12 @@ ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,3,5,9,11,15,17,21,23,27,29,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpternlogq $234, %zmm2, %zmm0, %zmm4 @@ -222,14 +223,13 @@ ; AVX512F-NEXT: 
vmovdqa 112(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; @@ -263,14 +263,13 @@ ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: movabsq $8796090925056, %rax # imm = 0x7FFFFE00000 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} @@ -344,19 +343,17 @@ ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm4, %ymm0 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 @@ -366,14 +363,14 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm3 -; 
AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm5, %ymm2 +; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -496,11 +493,12 @@ ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,2,4,8,10,14,16,20,22,26,28,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpternlogq $234, %zmm2, %zmm0, %zmm4 @@ -509,14 +507,13 @@ ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; @@ -550,14 +547,13 @@ ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: movabsq $8796090925056, %rax # imm = 0x7FFFFE00000 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} diff --git 
a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -40,11 +40,11 @@ ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: foo: @@ -53,11 +53,11 @@ ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,2,3,5,6] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: retq ; @@ -70,10 +70,10 @@ ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $63488, %eax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -1224,8 +1224,8 @@ ; SSE2-NEXT: andl $3, %ecx ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; @@ -1241,8 +1241,8 @@ ; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm1 = 
mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; @@ -1256,9 +1256,9 @@ ; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: andl $3, %ecx -; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; @@ -1273,9 +1273,9 @@ ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 %x1 = extractelement <4 x float> %x, i32 %i1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2115,11 +2115,9 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: @@ -4607,11 +4605,61 @@ ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mul_add_const_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mul_add_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: mul_add_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: mul_add_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; 
AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-ALL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: mul_add_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-LABEL: mul_add_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 @@ -4633,11 +4681,61 @@ ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mul_add_self_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mul_add_self_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: mul_add_self_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: mul_add_self_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-ALL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; 
AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: mul_add_self_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-LABEL: mul_add_self_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 @@ -4659,11 +4757,61 @@ ; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmuldq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-ALL-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 
+; AVX2-FAST-PERLANE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -25,14 +25,14 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -154,14 +154,14 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -247,7 +247,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i32_store: @@ -264,14 +265,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: 
trunc_packus_v2i64_v2i32_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqd %xmm0, (%rdi) +; SKX-NEXT: vpmovusqd %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -289,27 +292,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -501,65 +504,65 @@ ; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i32: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm3 -; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = 
xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; 
SSE2-SSSE3-NEXT: por %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 @@ -568,8 +571,8 @@ ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 @@ -578,7 +581,7 @@ ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4 @@ -822,14 +825,14 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -965,14 +968,14 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1075,7 +1078,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i16_store: @@ -1092,14 +1096,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512BWVL-NEXT: 
vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i16_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqw %xmm0, (%rdi) +; SKX-NEXT: vpmovusqw %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -1113,59 +1119,60 @@ define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { ; SSE2-SSSE3-LABEL: trunc_packus_v4i64_v4i16: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = 
xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_packus_v4i64_v4i16: @@ -1307,27 +1314,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -1458,7 +1465,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1476,7 +1484,8 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; @@ -1484,7 +1493,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusqw %ymm0, (%rdi) +; SKX-NEXT: vpmovusqw %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -1502,58 +1512,58 @@ ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd 
{{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 @@ -1568,7 +1578,7 @@ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 @@ -1577,7 +1587,7 @@ ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 @@ -1889,37 +1899,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusdw %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_packus_v4i32_v4i16_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i32_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusdw %xmm0, (%rdi) +; SKX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -2206,14 +2195,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2239,14 +2228,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2368,14 +2357,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2403,14 +2392,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; 
SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2498,7 +2487,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i8_store: @@ -2515,14 +2505,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i8_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqb %xmm0, (%rdi) +; SKX-NEXT: vpmovusqb %xmm0, %xmm0 +; SKX-NEXT: vpextrw $0, %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -2540,27 +2532,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -2734,27 +2726,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, 
%xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -2889,7 +2881,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2907,7 +2900,8 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; @@ -2915,7 +2909,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusqb %ymm0, (%rdi) +; SKX-NEXT: vpmovusqb %ymm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -2933,58 +2928,58 @@ ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = 
xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 ; 
SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 @@ -3000,17 +2995,17 @@ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 @@ -3223,58 +3218,58 @@ ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm3 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm2 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: 
pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6 @@ -3290,17 +3285,17 @@ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4 @@ -3487,7 +3482,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovusqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3517,9 +3513,9 @@ ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm12 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm10 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm10 +; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm9 ; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm4 ; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm3 @@ 
-3527,105 +3523,105 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm10 
+; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm10, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm10 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm10, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm10 -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm12 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: por %xmm12, %xmm10 -; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm12 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm12 -; SSE2-SSSE3-NEXT: pand %xmm12, %xmm11 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm12 -; SSE2-SSSE3-NEXT: por %xmm11, %xmm12 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: por %xmm11, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm11 -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm11 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm10 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: por %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: por %xmm10, %xmm11 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = 
xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm10 +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm10 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm8 @@ -3641,19 +3637,19 @@ ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm12 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm12, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 @@ -3663,18 +3659,18 @@ ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm12, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 @@ -4023,13 +4019,14 @@ ; SKX-NEXT: vpmovusqb %ymm1, %xmm1 ; SKX-NEXT: vpmaxsq 64(%rdi), %ymm0, %ymm2 ; SKX-NEXT: vpmovusqb %ymm2, %xmm2 -; SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm2 -; SKX-NEXT: vpmovusqb %ymm2, %xmm2 +; SKX-NEXT: 
vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; SKX-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpmovusqb %ymm1, %xmm1 ; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 ; SKX-NEXT: vpmovusqb %ymm0, %xmm0 -; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <16 x i64>, ptr %p0 @@ -4211,7 +4208,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store: @@ -4227,14 +4225,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i32_v4i8_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusdb %xmm0, (%rdi) +; SKX-NEXT: vpmovusdb %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -4354,7 +4354,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -4371,7 +4372,8 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; @@ -4379,7 +4381,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusdb %ymm0, (%rdi) +; SKX-NEXT: vpmovusdb %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -4544,36 +4547,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_packus_v8i16_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v8i16_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -25,26 +25,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -147,26 +148,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} 
xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -239,7 +241,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqd %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i32_store: @@ -252,12 +255,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqd %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i32_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqd %xmm0, (%rdi) +; SKX-NEXT: vpmovsqd %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -275,27 +280,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -303,30 +308,31 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i32: @@ -491,109 +497,112 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; 
SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm7, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm7 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 -; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm10, %xmm9 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm5[0,2] +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i32: @@ -814,26 +823,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -949,26 +959,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd 
%xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1057,7 +1068,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i16_store: @@ -1070,12 +1082,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqw %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqw %xmm0, (%rdi) +; SKX-NEXT: vpmovsqw %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -1093,27 +1107,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -1121,30 +1135,31 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE2-SSSE3-NEXT: movdqa %xmm5, 
%xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: retq ; @@ -1278,27 +1293,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; 
SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -1306,32 +1321,33 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: movq %xmm2, (%rdi) +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movq %xmm1, (%rdi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i16_store: @@ -1430,7 +1446,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v4i64_v4i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqw %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1444,13 +1461,15 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqw %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqw 
%ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i64_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqw %ymm0, (%rdi) +; SKX-NEXT: vpmovsqw %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -1473,110 +1492,113 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm10, %xmm9 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = 
xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i16: @@ -1794,31 +1816,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdw %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdw %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_ssat_v4i32_v4i16_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i32_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsdw %xmm0, (%rdi) +; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -1944,30 +1951,31 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; 
SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 @@ -1978,30 +1986,31 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; @@ -2099,26 +2108,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSE2-NEXT: pcmpgtd %xmm2, 
%xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2135,26 +2145,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2228,7 +2239,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i8_store: @@ -2241,12 +2253,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqb %xmm0, (%rdi) +; 
AVX512BWVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqb %xmm0, (%rdi) +; SKX-NEXT: vpmovsqb %xmm0, %xmm0 +; SKX-NEXT: vpextrw $0, %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -2264,27 +2278,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -2292,30 +2306,31 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: retq @@ -2453,27 +2468,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm3 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm8, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -2481,33 +2496,34 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551488,18446744073709551488] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: 
pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: movd %xmm2, (%rdi) +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm6, %xmm1 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm1 +; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i8_store: @@ -2609,7 +2625,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2623,13 +2640,15 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i64_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqb %ymm0, (%rdi) +; SKX-NEXT: vpmovsqb %ymm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -2652,110 +2671,113 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; 
SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm7 ; 
SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm10, %xmm9 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: 
pcmpgtd %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: retq ; @@ -2949,112 +2971,115 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = 
xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm10, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm6, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm10, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm10, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm6 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; 
SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm7 -; SSE2-SSSE3-NEXT: packsswb %xmm7, %xmm7 -; SSE2-SSSE3-NEXT: movq %xmm7, (%rsi) +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm6 +; SSE2-SSSE3-NEXT: packsswb %xmm6, %xmm6 +; SSE2-SSSE3-NEXT: movq %xmm6, (%rsi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: @@ -3214,7 +3239,8 @@ ; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovsqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -3241,11 +3267,11 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-SSSE3-LABEL: trunc_ssat_v16i64_v16i8: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm8 +; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm12 -; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm7 +; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm10 +; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm8 ; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm4 ; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm3 @@ -3253,213 +3279,220 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] -; SSE2-SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 -; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm2 ; SSE2-SSSE3-NEXT: por %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm3 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5 
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm12 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm12, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm12 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm12 -; SSE2-SSSE3-NEXT: pand %xmm12, %xmm11 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm12 -; SSE2-SSSE3-NEXT: por %xmm11, %xmm12 -; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm11 +; SSE2-SSSE3-NEXT: por %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm8 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm11 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm11, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm15 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm15, %xmm11 -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm11 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm10 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: por %xmm8, %xmm11 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm10, %xmm11 +; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = 
xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm14, %xmm10 +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm10 +; SSE2-SSSE3-NEXT: por %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm13, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm0, %xmm12 +; SSE2-SSSE3-NEXT: pand %xmm12, %xmm9 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm12 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm12 ; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm13 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm9[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm13, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm14, %xmm13 -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm10 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm13 -; SSE2-SSSE3-NEXT: por %xmm10, %xmm13 -; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm14 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm14, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm11 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm10 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm11, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm13, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm12, %xmm10 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = 
xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm13 +; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm12, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm11, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm12, %xmm9 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm13, %xmm10 -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm12 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm10 -; SSE2-SSSE3-NEXT: por %xmm12, %xmm10 -; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm11 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm12 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm12, %xmm13 +; SSE2-SSSE3-NEXT: por %xmm11, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm11 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm12, %xmm9 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm13, %xmm11 -; SSE2-SSSE3-NEXT: pand %xmm11, %xmm7 +; SSE2-SSSE3-NEXT: por %xmm9, %xmm11 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSE2-SSSE3-NEXT: por %xmm7, %xmm11 +; SSE2-SSSE3-NEXT: por %xmm8, %xmm11 ; SSE2-SSSE3-NEXT: packssdw %xmm10, %xmm11 ; SSE2-SSSE3-NEXT: packssdw %xmm11, %xmm0 -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm11, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm10, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,3,3] ; SSE2-SSSE3-NEXT: pand %xmm10, 
%xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-SSSE3-NEXT: por %xmm11, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm5 +; SSE2-SSSE3-NEXT: packssdw %xmm9, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm10, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: por %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm8 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: por %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 -; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pandn %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm8, %xmm3 +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: packsswb %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: retq ; @@ -3760,11 +3793,12 @@ ; SKX-NEXT: vmovdqa 96(%rdi), %ymm3 ; SKX-NEXT: vpmovsqb %ymm3, %xmm3 ; SKX-NEXT: vpmovsqb %ymm2, %xmm2 -; SKX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SKX-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; SKX-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; SKX-NEXT: vpmovsqb %ymm1, %xmm1 ; SKX-NEXT: vpmovsqb %ymm0, %xmm0 ; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <16 x i64>, ptr %p0 @@ -3935,7 +3969,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8_store: @@ -3948,12 +3983,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, 
(%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i32_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsdb %xmm0, (%rdi) +; SKX-NEXT: vpmovsdb %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -4065,7 +4102,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -4080,13 +4118,15 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v8i32_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsdb %ymm0, (%rdi) +; SKX-NEXT: vpmovsdb %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -4252,32 +4292,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovswb %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_ssat_v8i16_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v8i16_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovswb %xmm0, (%rdi) +; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -24,11 +24,13 @@ ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 @@ -40,12 +42,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = 
[9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pandn %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: retq @@ -99,11 +103,13 @@ ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 @@ -116,12 +122,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pandn %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: movq %xmm0, (%rdi) @@ -146,7 +154,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i32_store: @@ -159,12 +168,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i32_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqd %xmm0, (%rdi) +; SKX-NEXT: vpmovusqd %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -179,47 +190,48 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] @@ -346,109 +358,108 @@ ; SSE2-SSSE3-LABEL: trunc_usat_v8i64_v8i32: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm2 -; 
SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm5 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm1 ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 +; 
SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] ; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm4 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm7 ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm6, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, 
%xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2] -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i32: @@ -562,11 +573,13 @@ ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 @@ -579,12 +592,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pandn %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -655,11 +670,13 @@ ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991] +; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-SSSE3-NEXT: pand 
%xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 @@ -673,12 +690,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pandn %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -724,7 +743,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i16_store: @@ -737,12 +757,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqw %xmm0, (%rdi) +; SKX-NEXT: vpmovusqw %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -757,27 +779,27 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm3 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: 
pcmpeqd %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 @@ -787,31 +809,30 @@ ; ; SSE41-LABEL: trunc_usat_v4i64_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16: @@ -891,61 +912,60 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 -; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm3 ; 
SSE2-SSSE3-NEXT: por %xmm1, %xmm3 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm1 -; SSE2-SSSE3-NEXT: psrad $16, %xmm1 -; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1 -; SSE2-SSSE3-NEXT: movq %xmm1, (%rdi) +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pslld $16, %xmm2 +; SSE2-SSSE3-NEXT: psrad $16, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm2 +; SSE2-SSSE3-NEXT: movq %xmm2, (%rdi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i16_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rdi) +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: movq %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16_store: @@ -994,7 +1014,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqw 
%ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1008,13 +1029,15 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i64_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqw %ymm0, (%rdi) +; SKX-NEXT: vpmovusqw %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <4 x i64> %a0, @@ -1034,109 +1057,108 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; 
SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE2-SSSE3-NEXT: pslld $16, %xmm5 -; SSE2-SSSE3-NEXT: psrad $16, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2 +; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2] +; SSE2-SSSE3-NEXT: pslld $16, %xmm2 +; SSE2-SSSE3-NEXT: psrad $16, %xmm2 ; SSE2-SSSE3-NEXT: pslld $16, %xmm0 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] ; SSE41-NEXT: movdqa %xmm5, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 
+; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i16: @@ -1147,25 +1169,25 @@ ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [65535,65535] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_usat_v8i64_v8i16: @@ -1352,7 +1374,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i16_store: @@ -1365,12 +1388,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi) +; AVX512BWVL-NEXT: 
vpmovusdw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i32_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdw %xmm0, (%rdi) +; SKX-NEXT: vpmovusdw %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -1647,11 +1672,13 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -1664,11 +1691,13 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm1, %xmm0 @@ -1680,12 +1709,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pandn %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -1738,11 +1769,13 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 @@ -1757,11 +1790,13 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm0, %xmm1 @@ -1775,12 +1810,14 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pandn %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pextrw $0, %xmm2, (%rdi) @@ -1805,7 +1842,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i8_store: @@ -1818,12 +1856,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqb %xmm0, (%rdi) +; SKX-NEXT: vpmovusqb %xmm0, %xmm0 +; SKX-NEXT: vpextrw $0, %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -1839,26 +1879,26 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = 
xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 @@ -1868,27 +1908,26 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm6, %xmm2 ; SSE41-NEXT: packusdw %xmm2, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm0 @@ -1974,60 +2013,59 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = 
[9223372039002259711,9223372039002259711] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm1 -; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm1 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1 -; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1 -; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi) +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm3 +; SSE2-SSSE3-NEXT: movd %xmm3, (%rdi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm4 -; SSE41-NEXT: packuswb %xmm4, %xmm4 -; SSE41-NEXT: 
movd %xmm4, (%rdi) +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: packuswb %xmm3, %xmm3 +; SSE41-NEXT: movd %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8_store: @@ -2078,7 +2116,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2092,13 +2131,15 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i64_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqb %ymm0, (%rdi) +; SKX-NEXT: vpmovusqb %ymm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <4 x i64> %a0, @@ -2119,106 +2160,105 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm6 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: pxor %xmm3, 
%xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 -; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm5 -; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSE2-SSSE3-NEXT: por %xmm1, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm3 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] ; SSE41-NEXT: movdqa %xmm5, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, 
%xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8: @@ -2229,25 +2269,25 @@ ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2303,107 +2343,106 @@ ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} 
xmm3 = [9223372039002259711,9223372039002259711] +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5 ; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5 ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm5 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm5 ; SSE2-SSSE3-NEXT: packuswb %xmm7, %xmm5 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 ; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm6 ; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 -; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm4 -; SSE2-SSSE3-NEXT: packuswb %xmm4, %xmm5 +; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSE2-SSSE3-NEXT: por %xmm0, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm6, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm5 ; SSE2-SSSE3-NEXT: packuswb %xmm5, %xmm5 ; SSE2-SSSE3-NEXT: movq %xmm5, (%rsi) ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm6 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm9 
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: packusdw %xmm9, %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] ; SSE41-NEXT: movdqa %xmm4, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm7 -; SSE41-NEXT: packuswb %xmm7, %xmm7 -; SSE41-NEXT: movq %xmm7, (%rsi) +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE41-NEXT: packusdw %xmm8, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm6 +; SSE41-NEXT: packuswb %xmm6, %xmm6 +; SSE41-NEXT: movq %xmm6, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: @@ -2414,25 +2453,25 @@ ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: 
vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rsi) ; AVX1-NEXT: retq @@ -2462,14 +2501,16 @@ ; AVX512-LABEL: trunc_usat_v8i64_v8i8_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovusqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i64_v8i8_store: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovusqb %zmm0, (%rsi) +; SKX-NEXT: vpmovusqb %zmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rsi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <8 x i64>, ptr %p0 @@ -2484,7 +2525,7 @@ ; SSE2-SSSE3-LABEL: trunc_usat_v16i64_v16i8: ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movdqa 96(%rdi), %xmm1 -; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm4 +; SSE2-SSSE3-NEXT: movdqa 112(%rdi), %xmm3 ; SSE2-SSSE3-NEXT: movdqa 64(%rdi), %xmm6 ; SSE2-SSSE3-NEXT: movdqa 80(%rdi), %xmm7 ; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm10 @@ -2492,96 +2533,96 @@ ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm8 ; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm9 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm11 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm13 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm11 +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm11 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm11 
-; SSE2-SSSE3-NEXT: pand %xmm13, %xmm11 +; SSE2-SSSE3-NEXT: pand %xmm12, %xmm11 ; SSE2-SSSE3-NEXT: pand %xmm11, %xmm0 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm11 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm11 ; SSE2-SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm13 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm13, %xmm0 +; SSE2-SSSE3-NEXT: pand %xmm12, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm10 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSE2-SSSE3-NEXT: por %xmm10, %xmm0 ; SSE2-SSSE3-NEXT: packuswb %xmm11, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm9, %xmm10 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm10 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm11 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm10 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm10 -; SSE2-SSSE3-NEXT: pand %xmm12, %xmm10 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm10 ; SSE2-SSSE3-NEXT: pand %xmm10, %xmm9 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm10 ; SSE2-SSSE3-NEXT: por %xmm9, %xmm10 ; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm9 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm9 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm12 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm11 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm9 -; SSE2-SSSE3-NEXT: pand %xmm12, %xmm9 +; SSE2-SSSE3-NEXT: pand %xmm11, %xmm9 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm9 ; SSE2-SSSE3-NEXT: por %xmm8, %xmm9 ; SSE2-SSSE3-NEXT: packuswb %xmm10, %xmm9 ; SSE2-SSSE3-NEXT: packuswb %xmm9, %xmm0 ; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm8 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm8 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm8 ; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm8 ; SSE2-SSSE3-NEXT: por %xmm7, %xmm8 ; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSE2-SSSE3-NEXT: 
pcmpgtd %xmm7, %xmm9 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-SSSE3-NEXT: pand %xmm10, %xmm7 +; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7 ; SSE2-SSSE3-NEXT: pand %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm7 ; SSE2-SSSE3-NEXT: packuswb %xmm8, %xmm7 -; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6 -; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6 +; SSE2-SSSE3-NEXT: pand %xmm6, %xmm3 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSE2-SSSE3-NEXT: por %xmm4, %xmm6 -; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE2-SSSE3-NEXT: por %xmm3, %xmm6 +; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSE2-SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4 +; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 ; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSE2-SSSE3-NEXT: por %xmm1, %xmm4 @@ -2592,153 +2633,151 @@ ; ; SSE41-LABEL: trunc_usat_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa 96(%rdi), %xmm3 -; SSE41-NEXT: movdqa 112(%rdi), %xmm5 -; SSE41-NEXT: movdqa 64(%rdi), %xmm8 -; SSE41-NEXT: movdqa 80(%rdi), %xmm9 -; SSE41-NEXT: movdqa (%rdi), %xmm12 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm10 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa 96(%rdi), %xmm2 +; SSE41-NEXT: movdqa 112(%rdi), %xmm4 +; SSE41-NEXT: movdqa 64(%rdi), %xmm7 +; SSE41-NEXT: movdqa 80(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm11 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm9 +; SSE41-NEXT: movdqa 48(%rdi), %xmm10 +; SSE41-NEXT: movapd {{.*#+}} xmm3 
= [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 -; SSE41-NEXT: packusdw %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] ; SSE41-NEXT: movdqa %xmm6, %xmm12 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movapd %xmm3, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: packusdw %xmm12, %xmm1 ; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm11 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm11 +; SSE41-NEXT: movapd %xmm3, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 -; SSE41-NEXT: packusdw %xmm12, %xmm11 -; SSE41-NEXT: packusdw %xmm11, %xmm2 ; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm10 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm10 +; SSE41-NEXT: movapd %xmm3, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE41-NEXT: packusdw %xmm11, %xmm10 +; SSE41-NEXT: packusdw %xmm10, %xmm1 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm9 +; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 -; SSE41-NEXT: packusdw %xmm10, %xmm9 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: 
pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: packusdw %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm8 +; SSE41-NEXT: packuswb %xmm8, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm8, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm9 -; SSE41-NEXT: packuswb %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm6, %xmm9 -; AVX1-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX1-NEXT: vpxor %xmm4, %xmm10, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm11 -; AVX1-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX1-NEXT: vpxor %xmm4, %xmm12, %xmm13 -; AVX1-NEXT: vpcmpgtq %xmm13, %xmm6, %xmm13 -; AVX1-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX1-NEXT: vpxor %xmm4, %xmm14, %xmm15 -; AVX1-NEXT: vpcmpgtq %xmm15, %xmm6, %xmm15 -; AVX1-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm7, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 +; 
AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vblendvpd %xmm15, %xmm14, %xmm6, %xmm4 -; AVX1-NEXT: vblendvpd %xmm13, %xmm12, %xmm6, %xmm12 -; AVX1-NEXT: vblendvpd %xmm11, %xmm10, %xmm6, %xmm10 -; AVX1-NEXT: vblendvpd %xmm9, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm5 # 16-byte Folded Reload -; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackusdw %xmm12, %xmm10, %xmm4 -; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_usat_v16i64_v16i8: @@ -2935,7 +2974,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i8_store: @@ -2948,12 +2988,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i32_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdb %xmm0, (%rdi) +; SKX-NEXT: vpmovusdb %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -3117,7 +3159,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -3131,13 +3174,15 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i32_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdb %ymm0, (%rdi) +; SKX-NEXT: vpmovusdb %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <8 x i32> %a0, @@ -3456,12 +3501,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v8i16_v8i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi) +; AVX512BWVL-NEXT: 
vpmovuswb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i16_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpmovuswb %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -84,7 +84,8 @@ ; ; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST-ALL: # %bb.0: # %entry -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -132,7 +133,8 @@ ; ; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr: ; AVX2-FAST-ALL: # %bb.0: # %entry -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1793,11 +1795,22 @@ } define <8 x i16> @PR32160(<8 x i32> %x) { -; SSE-LABEL: PR32160: -; SSE: # %bb.0: -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: retq +; SSE2-LABEL: PR32160: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR32160: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR32160: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq ; ; AVX-LABEL: PR32160: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -16,27 +16,28 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -141,27 +142,28 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -266,34 +268,35 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -415,34 +418,35 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = 
xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -562,29 +566,30 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -710,29 +715,30 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: 
testv16i16u: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -860,24 +866,25 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 
-; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1002,24 +1009,25 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -264,30 +264,31 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: 
vpand %ymm1, %ymm0, %ymm2 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm1, %ymm4, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -334,19 +335,20 @@ ; ; AVX512VPOPCNTDQ-LABEL: testv32i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2 -; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv32i16: @@ -363,30 +365,31 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; 
AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm1, %ymm4, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -433,19 +436,20 @@ ; ; AVX512VPOPCNTDQ-LABEL: testv32i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2 -; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; 
AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv32i16u: @@ -464,25 +468,26 @@ ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; @@ -522,25 +527,26 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -560,25 +566,26 @@ ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, 
%ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; @@ -618,25 +625,26 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -590,13 +590,13 @@ ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: 
movdqa %xmm2, 16(%rdi) +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movdqa %xmm1, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm1, (%rsi) ; SSE-NEXT: .LBB18_2: # %if.end ; SSE-NEXT: retq ; @@ -609,12 +609,12 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, (%rdi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm1, (%rsi) ; AVX1-NEXT: .LBB18_2: # %if.end ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2332,11 +2332,8 @@ ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movl 8(%rdi), %ecx -; SSE2-NEXT: shll $13, %ecx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $51, %rdx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shldq $13, %rax, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shrq $34, %rax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2353,11 +2350,8 @@ ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movl 8(%rdi), %ecx -; SSSE3-NEXT: shll $13, %ecx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: shrq $51, %rdx -; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: shldq $13, %rax, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: shrq $34, %rax ; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2367,15 +2361,12 @@ ; ; SSE41-LABEL: zext_4i17_to_4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movl 8(%rdi), %eax -; SSE41-NEXT: shll $13, %eax -; SSE41-NEXT: movq (%rdi), %rcx -; SSE41-NEXT: movq %rcx, %rdx -; SSE41-NEXT: shrq $51, %rdx -; SSE41-NEXT: orl %eax, %edx -; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: movl 8(%rdi), %edx +; SSE41-NEXT: shldq $13, %rax, %rdx ; SSE41-NEXT: shrq $17, %rax -; SSE41-NEXT: movd %ecx, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 ; SSE41-NEXT: shrq $34, %rcx ; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 @@ -2385,15 +2376,12 @@ ; ; AVX1-LABEL: zext_4i17_to_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: movl 8(%rdi), %eax -; AVX1-NEXT: shll $13, %eax -; AVX1-NEXT: movq (%rdi), %rcx -; AVX1-NEXT: movq %rcx, %rdx -; AVX1-NEXT: shrq $51, %rdx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movl 8(%rdi), %edx +; AVX1-NEXT: shldq $13, %rax, %rdx ; AVX1-NEXT: shrq $17, %rax -; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: shrq $34, %rcx ; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 @@ -2403,15 +2391,12 @@ ; 
; AVX2-LABEL: zext_4i17_to_4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: movl 8(%rdi), %eax -; AVX2-NEXT: shll $13, %eax -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: movq %rcx, %rdx -; AVX2-NEXT: shrq $51, %rdx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movl 8(%rdi), %edx +; AVX2-NEXT: shldq $13, %rax, %rdx ; AVX2-NEXT: shrq $17, %rax -; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: shrq $34, %rcx ; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 @@ -2422,15 +2407,12 @@ ; ; AVX512-LABEL: zext_4i17_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: movl 8(%rdi), %eax -; AVX512-NEXT: shll $13, %eax -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq %rcx, %rdx -; AVX512-NEXT: shrq $51, %rdx -; AVX512-NEXT: orl %eax, %edx -; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: movl 8(%rdi), %edx +; AVX512-NEXT: shldq $13, %rax, %rdx ; AVX512-NEXT: shrq $17, %rax -; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512-NEXT: shrq $34, %rcx ; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 @@ -2555,25 +2537,31 @@ define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) { ; SSE2-LABEL: splatshuf_zext_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: splatshuf_zext_v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: splatshuf_zext_v4i64: ; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v4i64: @@ -2645,25 +2633,30 @@ define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) { ; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: 
pshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,7,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[u,u],zero,zero,xmm1[6,7],zero,zero,xmm1[14,15],zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[6,7],zero,zero,xmm1[6,7],zero,zero,xmm1[14,15],zero,zero ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef: diff --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll --- a/llvm/test/CodeGen/X86/viabs.ll +++ b/llvm/test/CodeGen/X86/viabs.ll @@ -174,35 +174,50 @@ define <4 x i32> @test_abs_le_v4i32(<4 x i32> %a) nounwind { ; SSE2-LABEL: test_abs_le_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pabsd %xmm0, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pabsd %xmm0, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpabsd %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x18,0x1f,0x0d,A,A,A,A,0x01] +; 
AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubd %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0xfa,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <4 x i32> zeroinitializer, %a %b = icmp sle <4 x i32> %a, zeroinitializer @@ -411,44 +426,68 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind { ; SSE2-LABEL: test_abs_le_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: psubd %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pabsd %xmm0, %xmm0 -; SSSE3-NEXT: pabsd %xmm1, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: psubd %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pabsd %xmm0, %xmm0 -; SSE41-NEXT: pabsd %xmm1, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: psubd %xmm3, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpabsd %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpabsd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x38,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0xfa,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <8 x i32> zeroinitializer, %a %b = icmp sle <8 x i32> %a, zeroinitializer @@ -459,61 +498,103 @@ define <16 x i32> @test_abs_le_16i32(<16 x 
i32> %a) nounwind { ; SSE2-LABEL: test_abs_le_16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: psubd %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: psubd %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pabsd %xmm0, %xmm0 -; SSSE3-NEXT: pabsd %xmm1, %xmm1 -; SSSE3-NEXT: pabsd %xmm2, %xmm2 -; SSSE3-NEXT: pabsd %xmm3, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: psubd %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: psubd %xmm7, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: psubd %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: psubd %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pabsd %xmm0, %xmm0 -; SSE41-NEXT: pabsd %xmm1, %xmm1 -; SSE41-NEXT: pabsd %xmm2, %xmm2 -; SSE41-NEXT: pabsd %xmm3, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: psubd %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm1 +; SSE41-NEXT: psubd %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: psubd %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: psubd %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpabsd %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpabsd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpabsd %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpabsd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubd %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1] +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm6, %xmm7 +; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvps %ymm5, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpabsd %ymm0, %ymm0 -; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x58,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0xfa,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <16 x i32> zeroinitializer, %a %b = icmp sle <16 x i32> %a, zeroinitializer @@ -637,92 +718,203 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind { ; SSE2-LABEL: test_abs_le_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: psubq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: 
pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: psubq %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: psubq %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: psubq %xmm7, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: psubq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: psubq %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: psubq %xmm9, %xmm1 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: psubq %xmm7, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm0, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; 
SSE41-NEXT: psubq %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: psubq %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: psubq %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: psubq %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: psubq %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1,1] +; AVX1-NEXT: # xmm6 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsq %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x58,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubq %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xf5,0x49,0xfb,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <8 x i64> zeroinitializer, %a %b = icmp sle <8 x i64> %a, zeroinitializer @@ -737,20 +929,53 @@ ; SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu 32(%rdi), %xmm2 ; SSE2-NEXT: movdqu 48(%rdi), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: psubq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: psubq %xmm6, 
%xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: psubq %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: psubq %xmm7, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq @@ -761,81 +986,161 @@ ; SSSE3-NEXT: movdqu 16(%rdi), %xmm1 ; SSSE3-NEXT: movdqu 32(%rdi), %xmm2 ; SSSE3-NEXT: movdqu 48(%rdi), %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: psubq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: psubq %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: psubq %xmm9, %xmm1 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: psubq %xmm7, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i64_fold: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqu (%rdi), %xmm1 -; SSE41-NEXT: movdqu 16(%rdi), %xmm2 -; SSE41-NEXT: movdqu 32(%rdi), %xmm3 -; SSE41-NEXT: movdqu 48(%rdi), %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE41-NEXT: psubq 
%xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: movdqu (%rdi), %xmm0 +; SSE41-NEXT: movdqu 16(%rdi), %xmm1 +; SSE41-NEXT: movdqu 32(%rdi), %xmm2 +; SSE41-NEXT: movdqu 48(%rdi), %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: psubq %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: psubq %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: psubq %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: psubq %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v8i64_fold: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovupd (%rdi), %ymm0 -; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq 16(%rdi), %xmm2, %xmm3 -; AVX1-NEXT: vpsubq (%rdi), %xmm2, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vpsubq 48(%rdi), %xmm2, %xmm3 -; AVX1-NEXT: vpsubq 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovupd 32(%rdi), %ymm0 +; AVX1-NEXT: vmovupd (%rdi), %ymm2 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqu (%rdi), %xmm5 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm6 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm7 +; AVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm1, %xmm8 +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8 +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = [1,1] +; AVX1-NEXT: # xmm9 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm9, %xmm1 +; 
AVX1-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm4, %ymm0, %ymm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm9, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v8i64_fold: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v8i64_fold: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsq (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x1f,0x07] +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x58,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubq %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xf5,0x49,0xfb,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %a = load <8 x i64>, ptr %a.ptr, align 8 %tmp1neg = sub <8 x i64> zeroinitializer, %a diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -95,9 +95,18 @@ define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) { ; AVX1-LABEL: test3: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX1-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; AVX1-NEXT: vpsrld $31, %xmm3, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -107,12 +116,18 @@ ; ; AVX2-LABEL: test3: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764] -; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 @@ -122,9 +137,18 @@ ; ; AVX512-LABEL: test3: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX512-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512-NEXT: vpsrld $31, %xmm3, %xmm4 +; AVX512-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; AVX512-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 @@ -284,17 +308,18 @@ ; AVX1-NEXT: vmovups (%rax), %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX1-NEXT: vmovups 16, %xmm2 -; AVX1-NEXT: vmovups 32, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3] -; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; AVX1-NEXT: vmovups 0, %xmm2 +; AVX1-NEXT: vmovups 16, %xmm3 +; AVX1-NEXT: vmovups 32, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2],xmm5[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2] ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vcmpneqps %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm3, %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovups %xmm0, (%rax) ; AVX1-NEXT: vmovups %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -274,11 +274,18 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: signbit_mask_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_mask_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw 
%xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_mask_v8i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <8 x i16> %a, zeroinitializer %r = select <8 x i1> %cond, <8 x i16> %b, <8 x i16> zeroinitializer ret <8 x i16> %r @@ -440,11 +447,18 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: signbit_mask_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_mask_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_mask_v16i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512DQBW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <16 x i16> %a, zeroinitializer %r = select <16 x i1> %cond, <16 x i16> %b, <16 x i16> zeroinitializer ret <16 x i16> %r @@ -639,11 +653,18 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: signbit_setmask_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_setmask_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_setmask_v8i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <8 x i16> %a, zeroinitializer %r = select <8 x i1> %cond, <8 x i16> , <8 x i16> %b ret <8 x i16> %r @@ -770,11 +791,18 @@ ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: signbit_setmask_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_setmask_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_setmask_v16i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512DQBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <16 x i16> %a, zeroinitializer %r = select <16 x i1> %cond, <16 x i16> , <16 x i16> %b ret <16 x i16> %r @@ -980,11 +1008,18 @@ ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: not_signbit_mask_swap_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: not_signbit_mask_swap_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: not_signbit_mask_swap_v8i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <8 x i16> %a, zeroinitializer %r = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %b ret <8 x i16> %r diff --git a/llvm/test/CodeGen/X86/vselect.ll 
b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -642,8 +642,7 @@ ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 @@ -656,23 +655,10 @@ ; SSE41-NEXT: movd %edi, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE41-NEXT: pinsrd $1, %edi, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq -; -; AVX-LABEL: simplify_select: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 %b = insertelement <2 x i32> , i32 %x, i32 0 %y = or <2 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -671,12 +671,12 @@ ; X86-SSE2-NEXT: andl $15, %ecx ; X86-SSE2-NEXT: movl (%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl 8(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 12(%esp,%ecx), %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -827,12 +827,12 @@ ; X86-SSE2-NEXT: movsbl %cl, %ecx ; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 20(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 28(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 24(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl 24(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 28(%esp,%ecx), %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -986,12 +986,12 @@ ; X86-SSE2-NEXT: andl $15, %ecx ; X86-SSE2-NEXT: movl (%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl 8(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 12(%esp,%ecx), %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 
4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1088,12 +1088,12 @@ ; X64-SSE2-NEXT: andl $31, %esi ; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax ; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi -; X64-SSE2-NEXT: movq %rsi, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) +; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: lshr_32bytes: @@ -1121,10 +1121,8 @@ ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %eax -; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1171,23 +1169,23 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 16(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 20(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 32(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 36(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-SSE2-NEXT: movl %ebx, 16(%eax) +; X86-SSE2-NEXT: movl %edi, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $72, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1229,10 +1227,8 @@ ; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) ; X86-AVX-NEXT: andl $31, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl @@ -1264,12 +1260,12 @@ ; X64-SSE2-NEXT: movsbq %sil, %rax ; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 16(%rdx) 
-; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) +; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: shl_32bytes: @@ -1301,10 +1297,8 @@ ; X64-AVX-NEXT: andb $31, %al ; X64-AVX-NEXT: negb %al ; X64-AVX-NEXT: movsbq %al, %rax -; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1348,28 +1342,28 @@ ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andb $31, %al ; X86-SSE2-NEXT: negb %al -; X86-SSE2-NEXT: movsbl %al, %edx -; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax +; X86-SSE2-NEXT: movsbl %al, %ecx +; X86-SSE2-NEXT: movl 40(%esp,%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax +; X86-SSE2-NEXT: movl 44(%esp,%ecx), %eax ; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi -; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi -; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx -; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp -; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx -; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx +; X86-SSE2-NEXT: movl 48(%esp,%ecx), %esi +; X86-SSE2-NEXT: movl 52(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 56(%esp,%ecx), %ebx +; X86-SSE2-NEXT: movl 60(%esp,%ecx), %ebp +; X86-SSE2-NEXT: movl 64(%esp,%ecx), %edx +; X86-SSE2-NEXT: movl 68(%esp,%ecx), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %edx, 24(%eax) ; X86-SSE2-NEXT: movl %ecx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-SSE2-NEXT: movl %ebx, 16(%eax) +; X86-SSE2-NEXT: movl %edi, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $72, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1415,10 +1409,8 @@ ; X86-AVX-NEXT: andb $31, %cl ; X86-AVX-NEXT: negb %cl ; X86-AVX-NEXT: movsbl %cl, %ecx -; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %ymm0 +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl @@ -1449,12 +1441,12 @@ ; X64-SSE2-NEXT: andl $31, %esi ; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax ; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi -; X64-SSE2-NEXT: movq %rsi, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) +; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 24(%rdx) +; 
X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_32bytes: @@ -1493,10 +1485,9 @@ ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %esi -; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; ; X86-SSE2-LABEL: ashr_32bytes: @@ -1543,23 +1534,23 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 16(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 20(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 32(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 36(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-SSE2-NEXT: movl %ebx, 16(%eax) +; X86-SSE2-NEXT: movl %edi, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $72, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1637,14 +1628,13 @@ ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: andl $31, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -1686,20 +1676,20 @@ ; X64-SSE2-NEXT: andl $63, %esi ; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax ; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi -; X64-SSE2-NEXT: movq %rsi, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) +; 
X64-SSE2-NEXT: movq -112(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %r8 +; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r9 +; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r10 +; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %r11 +; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 56(%rdx) +; X64-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-SSE2-NEXT: movq %r10, 40(%rdx) +; X64-SSE2-NEXT: movq %r9, 32(%rdx) +; X64-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; @@ -1725,8 +1715,8 @@ ; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2 ; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; @@ -1741,14 +1731,10 @@ ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: andl $63, %eax -; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1760,14 +1746,8 @@ ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: andl $63, %eax -; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -1854,55 +1834,55 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx +; X86-SSE2-NEXT: 
movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 88(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 92(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 96(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 100(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 56(%eax) -; X86-SSE2-NEXT: movl %edx, 60(%eax) -; X86-SSE2-NEXT: movl %esi, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) +; X86-SSE2-NEXT: movl %ecx, 60(%eax) +; X86-SSE2-NEXT: movl %edx, 56(%eax) +; X86-SSE2-NEXT: movl %esi, 52(%eax) +; X86-SSE2-NEXT: movl %edi, 48(%eax) +; X86-SSE2-NEXT: movl %ebx, 44(%eax) +; X86-SSE2-NEXT: movl %ebp, 40(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $168, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1957,14 +1937,10 @@ ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm0, (%esp) ; X86-AVX1-NEXT: andl $63, %ecx -; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX1-NEXT: vmovups %xmm0, (%eax) +; X86-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) ; X86-AVX1-NEXT: addl $128, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl @@ -1981,14 +1957,8 @@ ; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: vmovups %zmm0, (%esp) ; X86-AVX512-NEXT: andl $63, %ecx -; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; 
X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) +; X86-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) ; X86-AVX512-NEXT: addl $128, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl @@ -2033,20 +2003,20 @@ ; X64-SSE2-NEXT: movslq %esi, %rax ; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8 -; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9 -; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10 -; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11 -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) +; X64-SSE2-NEXT: movq -48(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r9 +; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r10 +; X64-SSE2-NEXT: movq -16(%rsp,%rax), %r11 +; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-SSE2-NEXT: movq %r10, 40(%rdx) +; X64-SSE2-NEXT: movq %r9, 32(%rdx) +; X64-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; @@ -2074,8 +2044,8 @@ ; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm2 ; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; @@ -2092,14 +2062,10 @@ ; X64-AVX1-NEXT: andl $63, %eax ; X64-AVX1-NEXT: negl %eax ; X64-AVX1-NEXT: cltq -; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -2113,14 +2079,8 @@ ; X64-AVX512-NEXT: andl $63, %eax ; X64-AVX512-NEXT: negl %eax ; X64-AVX512-NEXT: cltq -; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -2187,8 +2147,8 @@ ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl 
$63, %eax -; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: subl %eax, %ecx +; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-SSE2-NEXT: subl %eax, %edx ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2205,60 +2165,60 @@ ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-SSE2-NEXT: movl (%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 12(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 28(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 36(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%ecx), %ebp -; X86-SSE2-NEXT: movl 40(%ecx), %ebx -; X86-SSE2-NEXT: movl 52(%ecx), %edi -; X86-SSE2-NEXT: movl 60(%ecx), %esi -; X86-SSE2-NEXT: movl 56(%ecx), %edx +; X86-SSE2-NEXT: movl (%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 8(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 12(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 16(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 20(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 24(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 32(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 36(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 40(%edx), %ecx +; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl 44(%edx), %ebp +; X86-SSE2-NEXT: movl 48(%edx), %ebx +; X86-SSE2-NEXT: movl 52(%edx), %edi +; X86-SSE2-NEXT: movl 56(%edx), %esi +; X86-SSE2-NEXT: movl 60(%edx), %ecx ; X86-SSE2-NEXT: negl %eax -; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 108(%esp,%eax), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %edx, 56(%eax) -; X86-SSE2-NEXT: movl %esi, 60(%eax) -; X86-SSE2-NEXT: movl %ecx, 48(%eax) +; X86-SSE2-NEXT: movl %ecx, 60(%eax) +; X86-SSE2-NEXT: movl %esi, 56(%eax) ; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) +; X86-SSE2-NEXT: movl %ebx, 48(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-SSE2-NEXT: movl %ecx, 
40(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %edx, 4(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $168, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -2290,13 +2250,13 @@ ; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: subl %ecx, %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 -; X86-SSE42-NEXT: movups 16(%edx), %xmm1 -; X86-SSE42-NEXT: movups 32(%edx), %xmm2 +; X86-SSE42-NEXT: movups 32(%edx), %xmm1 +; X86-SSE42-NEXT: movups 48(%edx), %xmm2 ; X86-SSE42-NEXT: negl %ecx -; X86-SSE42-NEXT: movups 112(%esp,%ecx), %xmm3 -; X86-SSE42-NEXT: movups %xmm3, 48(%eax) -; X86-SSE42-NEXT: movups %xmm2, 32(%eax) -; X86-SSE42-NEXT: movups %xmm1, 16(%eax) +; X86-SSE42-NEXT: movups 80(%esp,%ecx), %xmm3 +; X86-SSE42-NEXT: movups %xmm2, 48(%eax) +; X86-SSE42-NEXT: movups %xmm1, 32(%eax) +; X86-SSE42-NEXT: movups %xmm3, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $128, %esp ; X86-SSE42-NEXT: retl @@ -2318,15 +2278,11 @@ ; X86-AVX1-NEXT: andl $63, %ecx ; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: subl %ecx, %edx -; X86-AVX1-NEXT: vmovups (%edx), %xmm0 -; X86-AVX1-NEXT: vmovups 16(%edx), %xmm1 -; X86-AVX1-NEXT: vmovups 32(%edx), %xmm2 +; X86-AVX1-NEXT: vmovups (%edx), %ymm0 ; X86-AVX1-NEXT: negl %ecx -; X86-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX1-NEXT: vmovups %xmm0, (%eax) +; X86-AVX1-NEXT: vmovups 96(%esp,%ecx), %ymm1 +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) ; X86-AVX1-NEXT: addl $128, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl @@ -2343,17 +2299,9 @@ ; X86-AVX512-NEXT: vmovups %zmm1, (%esp) ; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: andl $63, %ecx -; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: subl %ecx, %edx -; X86-AVX512-NEXT: vmovups (%edx), %xmm0 -; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1 -; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2 ; X86-AVX512-NEXT: negl %ecx -; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) +; X86-AVX512-NEXT: vmovups 64(%esp,%ecx), %zmm0 +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) ; X86-AVX512-NEXT: addl $128, %esp ; 
X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl @@ -2398,20 +2346,20 @@ ; X64-SSE2-NEXT: andl $63, %eax ; X64-SSE2-NEXT: movq -128(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -120(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -104(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rax), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rax), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rax), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rax), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) +; X64-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -104(%rsp,%rax), %r8 +; X64-SSE2-NEXT: movq -96(%rsp,%rax), %r9 +; X64-SSE2-NEXT: movq -88(%rsp,%rax), %r10 +; X64-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-SSE2-NEXT: movq %r10, 40(%rdx) +; X64-SSE2-NEXT: movq %r9, 32(%rdx) +; X64-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: popq %r14 ; X64-SSE2-NEXT: retq @@ -2443,43 +2391,65 @@ ; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1 ; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2 ; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3 -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) -; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) +; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_64bytes: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX-NEXT: movq 48(%rdi), %rax -; X64-AVX-NEXT: movq 56(%rdi), %rcx -; X64-AVX-NEXT: movl (%rsi), %esi -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: sarq $63, %rcx -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $63, %esi -; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2 -; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: ashr_64bytes: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX1-NEXT: movq 48(%rdi), %rax +; X64-AVX1-NEXT: movq 56(%rdi), %rcx +; X64-AVX1-NEXT: movl (%rsi), %esi +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: sarq $63, %rcx +; 
X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: andl $63, %esi +; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: ashr_64bytes: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX512-NEXT: movq 48(%rdi), %rax +; X64-AVX512-NEXT: movq 56(%rdi), %rcx +; X64-AVX512-NEXT: movl (%rsi), %esi +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: sarq $63, %rcx +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: andl $63, %esi +; X64-AVX512-NEXT: vmovups -128(%rsp,%rsi), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X86-SSE2-LABEL: ashr_64bytes: ; X86-SSE2: # %bb.0: @@ -2565,55 +2535,55 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 88(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 92(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 96(%esp,%eax), 
%edx +; X86-SSE2-NEXT: movl 100(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 56(%eax) -; X86-SSE2-NEXT: movl %edx, 60(%eax) -; X86-SSE2-NEXT: movl %esi, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) +; X86-SSE2-NEXT: movl %ecx, 60(%eax) +; X86-SSE2-NEXT: movl %edx, 56(%eax) +; X86-SSE2-NEXT: movl %esi, 52(%eax) +; X86-SSE2-NEXT: movl %edi, 48(%eax) +; X86-SSE2-NEXT: movl %ebx, 44(%eax) +; X86-SSE2-NEXT: movl %ebp, 40(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $168, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -2677,60 +2647,105 @@ ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: ashr_64bytes: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %ebx -; X86-AVX-NEXT: pushl %edi -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: subl $128, %esp -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: vmovups (%edx), %ymm0 -; X86-AVX-NEXT: vmovups 32(%edx), %xmm1 -; X86-AVX-NEXT: movl 48(%edx), %esi -; X86-AVX-NEXT: movl 52(%edx), %edi -; X86-AVX-NEXT: movl 56(%edx), %ebx -; X86-AVX-NEXT: movl 60(%edx), %edx -; X86-AVX-NEXT: movl (%ecx), %ecx -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %ymm0, (%esp) -; X86-AVX-NEXT: sarl $31, %edx -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, 
{{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andl $63, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X86-AVX-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) -; X86-AVX-NEXT: addl $128, %esp -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: popl %edi -; X86-AVX-NEXT: popl %ebx -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: ashr_64bytes: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: subl $128, %esp +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: vmovups (%edx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%edx), %xmm1 +; X86-AVX1-NEXT: movl 48(%edx), %esi +; X86-AVX1-NEXT: movl 52(%edx), %edi +; X86-AVX1-NEXT: movl 56(%edx), %ebx +; X86-AVX1-NEXT: movl 60(%edx), %edx +; X86-AVX1-NEXT: movl (%ecx), %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: vmovups %ymm0, (%esp) +; X86-AVX1-NEXT: sarl $31, %edx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: andl $63, %ecx +; X86-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) +; X86-AVX1-NEXT: addl $128, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: ashr_64bytes: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %ebx +; X86-AVX512-NEXT: pushl %edi +; X86-AVX512-NEXT: pushl %esi +; X86-AVX512-NEXT: subl $128, %esp +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX512-NEXT: vmovups (%edx), %ymm0 +; X86-AVX512-NEXT: vmovups 32(%edx), %xmm1 +; X86-AVX512-NEXT: movl 48(%edx), %esi +; X86-AVX512-NEXT: movl 52(%edx), %edi +; X86-AVX512-NEXT: movl 56(%edx), %ebx +; X86-AVX512-NEXT: movl 60(%edx), %edx +; X86-AVX512-NEXT: movl (%ecx), %ecx +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: vmovups %ymm0, (%esp) +; X86-AVX512-NEXT: sarl $31, %edx +; 
X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: andl $63, %ecx +; X86-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) +; X86-AVX512-NEXT: addl $128, %esp +; X86-AVX512-NEXT: popl %esi +; X86-AVX512-NEXT: popl %edi +; X86-AVX512-NEXT: popl %ebx +; X86-AVX512-NEXT: vzeroupper +; X86-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 @@ -2772,5 +2787,5 @@ ; FALLBACK7: {{.*}} ; FALLBACK8: {{.*}} ; FALLBACK9: {{.*}} -; X86: {{.*}} ; X64: {{.*}} +; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -608,40 +608,42 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah ; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx -; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -658,15 +660,15 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movb (%eax), %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -676,27 +678,25 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %ah ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -731,30 +731,30 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -768,7 +768,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx @@ -779,36 +779,38 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %bl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ebx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ebx, %ebp, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -920,43 +922,44 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl -; 
X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 12(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -1007,13 +1010,16 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%ebx) -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1061,17 +1067,17 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1122,10 +1128,10 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -1239,40 +1245,42 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1289,46 +1297,44 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%eax), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebp), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -1364,30 +1370,30 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1401,7 +1407,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx @@ -1412,7 +1418,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -1422,27 +1428,29 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%ebx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 12(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movl {{[0-9]+}}(%esp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -1458,7 +1466,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 @@ -1475,39 +1482,37 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; 
X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1544,8 +1549,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -1569,19 +1574,19 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx @@ -1589,8 +1594,8 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1627,8 +1632,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx @@ -1640,17 +1645,17 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax 
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi @@ -1660,10 +1665,10 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1672,95 +1677,87 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; 
X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal 
(%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1833,15 +1830,15 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1849,24 +1846,24 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1880,7 +1877,7 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax @@ -1890,13 +1887,13 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -1904,8 +1901,8 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1914,69 +1911,64 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 20(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebp +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1989,7 +1981,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ecx @@ -2024,57 +2016,61 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
56(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2120,26 +2116,26 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r11 +; 
X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq @@ -2178,11 +2174,12 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rsi), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; @@ -2209,24 +2206,24 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 
24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2267,8 +2264,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq @@ -2316,88 +2313,89 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: 
movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -2488,26 +2486,30 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -2521,13 +2523,13 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %ebp @@ -2541,12 +2543,12 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2558,69 +2560,63 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 84(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 80(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl 
%edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2710,11 +2706,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 12(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%esi) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2736,7 +2732,6 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 @@ -2754,39 +2749,37 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq 
-48(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2824,8 +2817,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -2850,19 +2843,19 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx @@ -2870,8 +2863,8 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2909,8 +2902,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx @@ -2922,17 +2915,17 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx @@ -2942,10 +2935,10 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -2955,95 +2948,87 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 
%ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3117,15 +3102,15 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3133,24 +3118,24 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -3164,11 +3149,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi @@ -3186,7 +3171,7 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -3197,73 +3182,66 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 20(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ebx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%edx), %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl (%esp), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3276,7 +3254,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx @@ -3312,57 +3290,61 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%ebp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 28(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 16(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -3378,7 +3360,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 @@ -3413,80 +3394,76 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; 
X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3567,8 +3544,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) @@ -3621,55 +3598,55 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r13, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r9d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %r9b ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %r10, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3744,8 +3721,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) @@ -4185,15 +4162,15 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -4208,41 +4185,41 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -4721,9 +4698,9 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi -; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 
-64(%rsp,%r14), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi @@ -4734,65 +4711,65 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r14), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r14), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r14), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r14), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r14), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %rbp ; 
X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 @@ -4841,55 +4818,57 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r10), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r10), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r10), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl 
%eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbp +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r9), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) @@ -4942,53 +4921,53 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq 
%rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rbx, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %r8, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r8, %r11, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 @@ -5043,38 +5022,38 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r15 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ebp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ebp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r13d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %r13d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %r13d ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax), %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r8, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r13, %r8, %r8 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %r8 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: 
shrq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %rbx, %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r13, %rbx, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r13 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbp, %r12, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %rbp +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %r13, %rbp, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r11, %rbp -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r15, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 40(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 40(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 48(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx @@ -5092,67 +5071,61 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5169,196 +5142,197 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %edx +; 
X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; 
X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx 
+; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, 
%cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 48(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 32(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -5373,7 +5347,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $208, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -5384,7 +5358,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx @@ -5392,7 +5366,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp @@ -5411,7 +5385,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -5419,7 
+5393,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -5429,11 +5403,11 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %edi, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5450,124 +5424,131 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 148(%esp,%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), 
%ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
60(%edi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5580,7 +5561,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $208, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; 
X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -5834,44 +5815,42 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $208, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5884,19 +5863,20 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %esi, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5913,126 +5893,123 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %ebx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 148(%esp,%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebp, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %ebp, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, (%esp) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 44(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 56(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $208, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -6047,7 +6024,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 @@ -6083,80 +6059,76 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl 
%eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) 
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6238,8 +6210,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) @@ -6293,55 +6265,55 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r13, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r9d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %r9b ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %r10, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r9, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6417,8 +6389,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) @@ -6860,15 +6832,15 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 128(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 
132(%esp,%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 132(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -6883,41 +6855,41 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 48(%eax) +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -7360,9 +7332,9 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; ALL: {{.*}} -; X86: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -28,25 +28,24 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzbl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzbl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzbl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init1 = load i8, ptr %src, align 1 %intermediate.sroa.0.0.vec.insert = insertelement <2 x i8> , i8 %init1, i64 0 @@ -81,25 +80,24 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init = load <2 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> @@ -135,25 +133,24 @@ ; ; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: +; X86-NO-BMI2-NEXT: movzbl 
{{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NEXT: movzwl (%edx), %edx -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movw %cx, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movw %ax, (%ecx) ; X86-BMI2-NEXT: retl %init = load <2 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> @@ -193,11 +190,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -220,11 +217,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx @@ -241,11 +238,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -268,11 +265,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -322,11 +319,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi @@ -349,11 +346,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -370,11 +367,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -396,11 +393,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -448,11 +445,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; 
X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi @@ -475,11 +472,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -496,11 +493,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -522,11 +519,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -623,10 +620,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -638,9 +634,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $32, %esp @@ -733,10 +727,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -748,9 +741,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $32, %esp @@ -842,10 +833,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -857,9 +847,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $32, %esp @@ -951,10 +939,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -966,9 +953,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -1012,10 +997,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1035,8 +1019,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $64, %esp @@ -1079,10 +1062,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1102,8 +1084,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1145,10 +1126,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1168,8 +1148,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1211,10 +1190,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1234,8 +1212,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -1283,10 +1260,9 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1306,8 +1282,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %esi ; X86-NEXT: movl 8(%esp,%ecx), %edi diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -30,25 +30,24 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init = load <2 x i8>, ptr %src, align 1 %intermediate.val.frozen = freeze <2 x i8> %init @@ -83,24 +82,23 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, (%edx), %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, (%edx), %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init = load <4 x i8>, ptr %src, align 1 %intermediate.val.frozen = freeze <4 x i8> %init @@ -133,24 +131,23 @@ ; ; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca: ; X86-NO-BMI2: # %bb.0: +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NEXT: movl (%edx), %edx -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, (%edx), %ecx -; X86-BMI2-NEXT: movw %cx, (%eax) +; X86-BMI2-NEXT: shrxl %eax, (%edx), %eax +; X86-BMI2-NEXT: movw %ax, (%ecx) ; X86-BMI2-NEXT: retl %init = load <4 x i8>, ptr %src, align 1 %intermediate.val.frozen = freeze <4 x i8> %init @@ -187,11 +184,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi ; 
X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx @@ -214,11 +211,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx @@ -235,11 +232,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -262,11 +259,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -311,11 +308,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -338,11 +335,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -359,11 +356,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -385,11 +382,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -432,11 +429,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -459,11 +456,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -480,11 +477,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd 
%xmm0, %esi @@ -506,11 +503,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -607,10 +604,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -622,9 +618,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $32, %esp @@ -715,10 +709,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -730,9 +723,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $32, %esp @@ -822,10 +813,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -837,9 +827,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $32, %esp @@ -929,10 +917,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -944,9 +931,7 @@ ; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -992,11 +977,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1019,8 +1003,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $64, %esp @@ -1063,11 +1046,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1090,8 +1072,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1133,11 +1114,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1160,8 +1140,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1203,11 +1182,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1230,8 +1208,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -1279,11 +1256,10 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1306,8 +1282,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %esi ; X86-NEXT: movl 8(%esp,%ecx), %edi @@ -1334,7 +1309,7 @@ ; no @load_32byte_chunk_of_32byte_alloca ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen_bitops-1.ll b/llvm/test/CodeGen/X86/widen_bitops-1.ll --- a/llvm/test/CodeGen/X86/widen_bitops-1.ll +++ b/llvm/test/CodeGen/X86/widen_bitops-1.ll @@ -70,8 +70,91 @@ define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind { ; X86-LABEL: and_i32_as_v8i4: ; X86: # %bb.0: +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: pinsrw $1, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm0 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: pinsrw $1, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm1 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm1 +; X86-NEXT: pand %xmm0, %xmm1 +; X86-NEXT: pextrw $0, %xmm1, %eax +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $2, %xmm1, %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: pextrw $3, %xmm1, %eax +; X86-NEXT: shll $12, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl 
%ecx, %eax +; X86-NEXT: pextrw $4, %xmm1, %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pextrw $5, %xmm1, %edx +; X86-NEXT: shll $20, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: pextrw $6, %xmm1, %ecx +; X86-NEXT: shll $24, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $7, %xmm1, %eax +; X86-NEXT: shll $28, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: and_i32_as_v8i4: @@ -89,8 +172,91 @@ define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind { ; X86-LABEL: xor_i32_as_v8i4: ; X86: # %bb.0: +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: pinsrw $1, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm0 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: pinsrw $1, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm1 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm1 +; X86-NEXT: pxor %xmm0, %xmm1 +; X86-NEXT: pextrw $0, %xmm1, %eax +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $2, %xmm1, %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: pextrw $3, %xmm1, %eax +; X86-NEXT: shll $12, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: pextrw $4, %xmm1, %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pextrw $5, %xmm1, %edx +; X86-NEXT: shll $20, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: pextrw $6, %xmm1, %ecx +; X86-NEXT: shll $24, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $7, %xmm1, %eax +; X86-NEXT: shll $28, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: xor_i32_as_v8i4: @@ -108,8 +274,91 @@ define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind { ; X86-LABEL: or_i32_as_v8i4: ; X86: # %bb.0: +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, 
%ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: pinsrw $1, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm0 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: pinsrw $1, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm1 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm1 +; X86-NEXT: por %xmm0, %xmm1 +; X86-NEXT: pextrw $0, %xmm1, %eax +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $2, %xmm1, %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: pextrw $3, %xmm1, %eax +; X86-NEXT: shll $12, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: pextrw $4, %xmm1, %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pextrw $5, %xmm1, %edx +; X86-NEXT: shll $20, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: pextrw $6, %xmm1, %ecx +; X86-NEXT: shll $24, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $7, %xmm1, %eax +; X86-NEXT: shll $28, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: or_i32_as_v8i4: diff --git a/llvm/test/CodeGen/X86/widen_cast-2.ll b/llvm/test/CodeGen/X86/widen_cast-2.ll --- a/llvm/test/CodeGen/X86/widen_cast-2.ll +++ b/llvm/test/CodeGen/X86/widen_cast-2.ll @@ -22,9 +22,9 @@ ; CHECK-NEXT: psubw %xmm0, %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) -; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) -; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) +; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) +; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) ; CHECK-NEXT: jle .LBB0_2 diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll --- a/llvm/test/CodeGen/X86/widen_fdiv.ll +++ b/llvm/test/CodeGen/X86/widen_fdiv.ll @@ -67,17 +67,46 @@ ; ; AVX1OR2-LABEL: widen_fdiv_v2f32_v8f32: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0 +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; 
AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX1OR2-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1OR2-NEXT: vdivps %ymm6, %ymm2, %ymm2 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vdivps %ymm3, %ymm0, %ymm0 +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1OR2-NEXT: vmovups %ymm0, (%rdx) ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: widen_fdiv_v2f32_v8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovups (%rdi), %ymm0 -; AVX512F-NEXT: vdivps (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovups %ymm0, (%rdx) +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX512F-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vdivps %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vdivps %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,8,10] +; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovupd %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -89,18 +118,17 @@ ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero -; AVX512VL-NEXT: vdivps %xmm5, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero -; AVX512VL-NEXT: vdivps %xmm6, %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX512VL-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vdivps %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vdivps %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512VL-NEXT: vmovups %ymm0, (%rdx) +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 +; AVX512VL-NEXT: vmovupd %ymm1, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a2 = getelementptr inbounds i8, ptr %a0, i64 8 @@ -170,12 +198,42 @@ ; ; AVX1OR2-LABEL: widen_fdiv_v2f32_v16f32: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0 -; AVX1OR2-NEXT: vdivps 32(%rsi), %ymm1, %ymm1 +; 
AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm13 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm14 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; AVX1OR2-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1OR2-NEXT: vdivps %ymm6, %ymm2, %ymm2 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vdivps %ymm3, %ymm0, %ymm0 +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1OR2-NEXT: vmovups %ymm0, (%rdx) -; AVX1OR2-NEXT: vmovups %ymm1, 32(%rdx) +; AVX1OR2-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm1 +; AVX1OR2-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm1 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm2 +; AVX1OR2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1OR2-NEXT: vmovups %ymm0, 32(%rdx) ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll --- a/llvm/test/CodeGen/X86/widen_load-2.ll +++ b/llvm/test/CodeGen/X86/widen_load-2.ll @@ -47,9 +47,9 @@ ; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: movd %xmm1, (%eax) -; X86-NEXT: pextrd $1, %xmm1, 4(%eax) ; X86-NEXT: pextrd $2, %xmm1, 8(%eax) +; X86-NEXT: pextrd $1, %xmm1, 4(%eax) +; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32_2: @@ -81,9 +81,9 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -94,8 +94,8 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddd (%rdx), %xmm0 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec7, ptr %ap, align 16 @@ -116,10 +116,10 @@ ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddd (%ecx), %xmm1 -; X86-NEXT: paddd 32(%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm2 -; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: paddd 32(%ecx), %xmm0 ; X86-NEXT: movdqa %xmm0, 32(%eax) +; X86-NEXT: movdqa %xmm2, 16(%eax) ; X86-NEXT: movdqa 
%xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -130,10 +130,10 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddd (%rdx), %xmm0 -; X64-NEXT: paddd 32(%rdx), %xmm2 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: paddd 32(%rdx), %xmm2 ; X64-NEXT: movdqa %xmm2, 32(%rdi) +; X64-NEXT: movdqa %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec12, ptr %ap, align 16 @@ -215,8 +215,8 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -248,10 +248,10 @@ ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddw (%ecx), %xmm1 -; X86-NEXT: paddw 32(%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm2 -; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: paddw 32(%ecx), %xmm0 ; X86-NEXT: movd %xmm0, 32(%eax) +; X86-NEXT: movdqa %xmm2, 16(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -262,10 +262,10 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddw (%rdx), %xmm0 -; X64-NEXT: paddw 32(%rdx), %xmm2 ; X64-NEXT: paddw 16(%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: paddw 32(%rdx), %xmm2 ; X64-NEXT: movd %xmm2, 32(%rdi) +; X64-NEXT: movdqa %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i16vec18, ptr %ap, align 16 @@ -317,11 +317,11 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrw $6, %xmm1, 28(%eax) ; X86-NEXT: pextrb $14, %xmm1, 30(%eax) +; X86-NEXT: pextrw $6, %xmm1, 28(%eax) +; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -332,10 +332,10 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddb (%rdx), %xmm0 ; X64-NEXT: paddb 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) -; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) ; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) +; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) +; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i8vec31, ptr %ap, align 16 diff --git a/llvm/test/CodeGen/X86/win64-byval.ll b/llvm/test/CodeGen/X86/win64-byval.ll --- a/llvm/test/CodeGen/X86/win64-byval.ll +++ b/llvm/test/CodeGen/X86/win64-byval.ll @@ -64,12 +64,12 @@ ; CHECK-NEXT: movq 8(%rax), %rax ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/win64_frame.ll b/llvm/test/CodeGen/X86/win64_frame.ll --- a/llvm/test/CodeGen/X86/win64_frame.ll +++ b/llvm/test/CodeGen/X86/win64_frame.ll @@ -27,9 +27,9 @@ ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .seh_setframe %rbp, 0 ; 
CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: movq %rdx, 32(%rbp) -; CHECK-NEXT: movq %r8, 40(%rbp) ; CHECK-NEXT: movq %r9, 48(%rbp) +; CHECK-NEXT: movq %r8, 40(%rbp) +; CHECK-NEXT: movq %rdx, 32(%rbp) ; CHECK-NEXT: leaq 32(%rbp), %rax ; CHECK-NEXT: movq %rax, (%rbp) ; CHECK-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/x32-va_start.ll b/llvm/test/CodeGen/X86/x32-va_start.ll --- a/llvm/test/CodeGen/X86/x32-va_start.ll +++ b/llvm/test/CodeGen/X86/x32-va_start.ll @@ -27,11 +27,11 @@ ; SSE-LABEL: foo: ; SSE: # %bb.0: # %entry ; SSE-NEXT: subl $72, %esp -; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%esp) -; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%esp) -; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%esp) -; SSE-NEXT: movq %r8, -{{[0-9]+}}(%esp) ; SSE-NEXT: movq %r9, -{{[0-9]+}}(%esp) +; SSE-NEXT: movq %r8, -{{[0-9]+}}(%esp) +; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%esp) +; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%esp) +; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%esp) ; SSE-NEXT: testb %al, %al ; SSE-NEXT: je .LBB0_5 ; SSE-NEXT: # %bb.4: # %entry @@ -69,11 +69,11 @@ ; ; NOSSE-LABEL: foo: ; NOSSE: # %bb.0: # %entry -; NOSSE-NEXT: movq %rsi, -{{[0-9]+}}(%esp) -; NOSSE-NEXT: movq %rdx, -{{[0-9]+}}(%esp) -; NOSSE-NEXT: movq %rcx, -{{[0-9]+}}(%esp) -; NOSSE-NEXT: movq %r8, -{{[0-9]+}}(%esp) ; NOSSE-NEXT: movq %r9, -{{[0-9]+}}(%esp) +; NOSSE-NEXT: movq %r8, -{{[0-9]+}}(%esp) +; NOSSE-NEXT: movq %rcx, -{{[0-9]+}}(%esp) +; NOSSE-NEXT: movq %rdx, -{{[0-9]+}}(%esp) +; NOSSE-NEXT: movq %rsi, -{{[0-9]+}}(%esp) ; NOSSE-NEXT: leal -{{[0-9]+}}(%rsp), %eax ; NOSSE-NEXT: movl %eax, -{{[0-9]+}}(%esp) ; NOSSE-NEXT: leal {{[0-9]+}}(%rsp), %eax diff --git a/llvm/test/CodeGen/X86/x86-64-baseptr.ll b/llvm/test/CodeGen/X86/x86-64-baseptr.ll --- a/llvm/test/CodeGen/X86/x86-64-baseptr.ll +++ b/llvm/test/CodeGen/X86/x86-64-baseptr.ll @@ -318,11 +318,11 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $200, %rsp ; CHECK-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rsi, -184(%rbp) -; CHECK-NEXT: movq %rdx, -176(%rbp) -; CHECK-NEXT: movq %rcx, -168(%rbp) -; CHECK-NEXT: movq %r8, -160(%rbp) ; CHECK-NEXT: movq %r9, -152(%rbp) +; CHECK-NEXT: movq %r8, -160(%rbp) +; CHECK-NEXT: movq %rcx, -168(%rbp) +; CHECK-NEXT: movq %rdx, -176(%rbp) +; CHECK-NEXT: movq %rsi, -184(%rbp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je .LBB3_2 ; CHECK-NEXT: # %bb.1: # %entry @@ -361,11 +361,11 @@ ; X32ABI-NEXT: andl $-16, %esp ; X32ABI-NEXT: subl $208, %esp ; X32ABI-NEXT: movl %esp, %ebx -; X32ABI-NEXT: movq %rsi, 24(%ebx) -; X32ABI-NEXT: movq %rdx, 32(%ebx) -; X32ABI-NEXT: movq %rcx, 40(%ebx) -; X32ABI-NEXT: movq %r8, 48(%ebx) ; X32ABI-NEXT: movq %r9, 56(%ebx) +; X32ABI-NEXT: movq %r8, 48(%ebx) +; X32ABI-NEXT: movq %rcx, 40(%ebx) +; X32ABI-NEXT: movq %rdx, 32(%ebx) +; X32ABI-NEXT: movq %rsi, 24(%ebx) ; X32ABI-NEXT: testb %al, %al ; X32ABI-NEXT: je .LBB3_2 ; X32ABI-NEXT: # %bb.1: # %entry diff --git a/llvm/test/CodeGen/X86/x86-64-varargs.ll b/llvm/test/CodeGen/X86/x86-64-varargs.ll --- a/llvm/test/CodeGen/X86/x86-64-varargs.ll +++ b/llvm/test/CodeGen/X86/x86-64-varargs.ll @@ -29,18 +29,18 @@ ; CHECK-X64-NEXT: movaps %xmm6, 192(%rsp) ; CHECK-X64-NEXT: movaps %xmm7, 208(%rsp) ; CHECK-X64-NEXT: LBB0_47: ## %entry -; CHECK-X64-NEXT: movq %rdi, 48(%rsp) -; CHECK-X64-NEXT: movq %rsi, 56(%rsp) -; CHECK-X64-NEXT: movq %rdx, 64(%rsp) -; CHECK-X64-NEXT: movq %rcx, 72(%rsp) -; CHECK-X64-NEXT: movq %r8, 80(%rsp) ; CHECK-X64-NEXT: movq %r9, 88(%rsp) +; CHECK-X64-NEXT: movq %r8, 80(%rsp) +; CHECK-X64-NEXT: movq %rcx, 72(%rsp) +; CHECK-X64-NEXT: movq %rdx, 64(%rsp) +; 
CHECK-X64-NEXT: movq %rsi, 56(%rsp) +; CHECK-X64-NEXT: movq %rdi, 48(%rsp) ; CHECK-X64-NEXT: movabsq $206158430208, %rax ## imm = 0x3000000000 ; CHECK-X64-NEXT: movq %rax, (%rsp) -; CHECK-X64-NEXT: leaq 240(%rsp), %rax -; CHECK-X64-NEXT: movq %rax, 8(%rsp) ; CHECK-X64-NEXT: leaq 48(%rsp), %rax ; CHECK-X64-NEXT: movq %rax, 16(%rsp) +; CHECK-X64-NEXT: leaq 240(%rsp), %rax +; CHECK-X64-NEXT: movq %rax, 8(%rsp) ; CHECK-X64-NEXT: movl (%rsp), %ecx ; CHECK-X64-NEXT: cmpl $48, %ecx ; CHECK-X64-NEXT: jae LBB0_2 @@ -288,18 +288,18 @@ ; CHECK-X32-NEXT: movaps %xmm6, 176(%esp) ; CHECK-X32-NEXT: movaps %xmm7, 192(%esp) ; CHECK-X32-NEXT: .LBB0_47: # %entry -; CHECK-X32-NEXT: movq %rdi, 32(%esp) -; CHECK-X32-NEXT: movq %rsi, 40(%esp) -; CHECK-X32-NEXT: movq %rdx, 48(%esp) -; CHECK-X32-NEXT: movq %rcx, 56(%esp) -; CHECK-X32-NEXT: movq %r8, 64(%esp) ; CHECK-X32-NEXT: movq %r9, 72(%esp) +; CHECK-X32-NEXT: movq %r8, 64(%esp) +; CHECK-X32-NEXT: movq %rcx, 56(%esp) +; CHECK-X32-NEXT: movq %rdx, 48(%esp) +; CHECK-X32-NEXT: movq %rsi, 40(%esp) +; CHECK-X32-NEXT: movq %rdi, 32(%esp) ; CHECK-X32-NEXT: movabsq $206158430208, %rax # imm = 0x3000000000 ; CHECK-X32-NEXT: movq %rax, (%esp) -; CHECK-X32-NEXT: leal 224(%rsp), %eax -; CHECK-X32-NEXT: movl %eax, 8(%esp) ; CHECK-X32-NEXT: leal 32(%rsp), %eax ; CHECK-X32-NEXT: movl %eax, 12(%esp) +; CHECK-X32-NEXT: leal 224(%rsp), %eax +; CHECK-X32-NEXT: movl %eax, 8(%esp) ; CHECK-X32-NEXT: movl (%esp), %ecx ; CHECK-X32-NEXT: cmpl $48, %ecx ; CHECK-X32-NEXT: jae .LBB0_2 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -407,11 +407,10 @@ ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vmovdqu (%rdi), %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm2 +; AVX512-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vzeroupper @@ -1072,109 +1071,115 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: subq $56, %rsp ; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %ymm2, %ymm4 +; AVX1-NEXT: vmovdqa %ymm3, %ymm4 +; AVX1-NEXT: vmovdqa %ymm2, %ymm5 ; AVX1-NEXT: vmovdqa %ymm0, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm7 ; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8 +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm8 ; AVX1-NEXT: vpor 
%xmm7, %xmm8, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15 -; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10 -; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm15 +; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm11 +; AVX1-NEXT: vpor %xmm8, %xmm11, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10 -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm11 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm7 ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm6 ; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 +; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13 -; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11 -; AVX1-NEXT: vpalignr 
{{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm8 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] +; AVX1-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX1-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm14 +; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] +; AVX1-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX1-NEXT: vmovdqa %ymm0, %ymm14 +; AVX1-NEXT: vpor %xmm7, %xmm10, %xmm10 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm10 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] +; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12 -; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14 -; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm11 +; AVX1-NEXT: vpor %xmm2, %xmm11, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm11 +; AVX1-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX1-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm11 +; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm12 +; AVX1-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10 -; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6 +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm8 +; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm6 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11 -; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX1-NEXT: vmovdqu %xmm7, 176(%rdi) +; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) +; AVX1-NEXT: vmovdqu %xmm12, 144(%rdi) +; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi) +; AVX1-NEXT: vmovdqu %xmm11, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) ; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi) ; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 48(%rdi) ; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi) -; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi) -; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) -; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi) -; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm4, (%rdi) +; AVX1-NEXT: addq $56, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1600,19 +1605,19 @@ ; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5 -; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm6 -; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm7 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm8 -; AVX512-NEXT: 
vextracti64x4 $1, %zmm4, %ymm9 -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm6 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm7 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 +; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm4 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi) @@ -1651,10 +1656,10 @@ ; ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, ptr %s, align 8 @@ -1688,10 +1693,10 @@ ; ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, ptr %s, align 8 @@ -1704,22 +1709,22 @@ define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind { ; AVX1-LABEL: splat4_v8f32_load_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0 -; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2 -; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3 -; AVX1-NEXT: vbroadcastss (%rdi), %xmm4 -; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5 -; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7 -; AVX1-NEXT: vmovups %xmm7, 48(%rsi) -; AVX1-NEXT: vmovups %xmm6, 32(%rsi) -; AVX1-NEXT: vmovups %xmm5, 16(%rsi) -; AVX1-NEXT: vmovups %xmm4, (%rsi) -; AVX1-NEXT: vmovups %xmm3, 112(%rsi) -; AVX1-NEXT: vmovups %xmm2, 96(%rsi) -; AVX1-NEXT: vmovups %xmm1, 80(%rsi) -; AVX1-NEXT: vmovups %xmm0, 64(%rsi) +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1 +; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm2 +; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm3 +; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm4 +; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm5 +; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm6 +; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm7 +; AVX1-NEXT: vmovups %xmm7, 112(%rsi) +; AVX1-NEXT: vmovups %xmm6, 96(%rsi) +; AVX1-NEXT: vmovups %xmm5, 80(%rsi) +; AVX1-NEXT: vmovups %xmm4, 
64(%rsi) +; AVX1-NEXT: vmovups %xmm3, 48(%rsi) +; AVX1-NEXT: vmovups %xmm2, 32(%rsi) +; AVX1-NEXT: vmovups %xmm1, 16(%rsi) +; AVX1-NEXT: vmovups %xmm0, (%rsi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat4_v8f32_load_store: @@ -1836,14 +1841,15 @@ ; ; AVX512-LABEL: splat4_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, ptr %s, align 8 @@ -1870,14 +1876,15 @@ ; ; AVX512-LABEL: splat4_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, ptr %s, align 8 diff --git a/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll b/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll --- a/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll +++ b/llvm/test/CodeGen/X86/x86-prefer-no-gather-no-scatter.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; Check that if option prefer-no-gather/scatter can disable gather/scatter instructions. 
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+fast-gather %s -o - | FileCheck %s --check-prefixes=GATHER ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,+fast-gather,+prefer-no-gather %s -o - | FileCheck %s --check-prefixes=NO-GATHER @@ -13,16 +14,684 @@ ; This tests the function that if prefer-no-gather can disable lowerMGather define void @test() #0 { ; GATHER-LABEL: test: -; GATHER: vpgatherdq +; GATHER: # %bb.0: # %iter.check +; GATHER-NEXT: movq $-1024, %rsi # imm = 0xFC00 +; GATHER-NEXT: movq A@GOTPCREL(%rip), %rax +; GATHER-NEXT: movq B@GOTPCREL(%rip), %rcx +; GATHER-NEXT: movq C@GOTPCREL(%rip), %rdx +; GATHER-NEXT: .p2align 4, 0x90 +; GATHER-NEXT: .LBB0_1: # %vector.body +; GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; GATHER-NEXT: vpmovsxbq 1052(%rax,%rsi), %ymm0 +; GATHER-NEXT: vpmovsxbq 1048(%rax,%rsi), %ymm1 +; GATHER-NEXT: vpmovsxbq 1044(%rax,%rsi), %ymm2 +; GATHER-NEXT: vpmovsxbq 1040(%rax,%rsi), %ymm3 +; GATHER-NEXT: vpmovsxbq 1036(%rax,%rsi), %ymm4 +; GATHER-NEXT: vpmovsxbq 1032(%rax,%rsi), %ymm5 +; GATHER-NEXT: vpmovsxbq 1028(%rax,%rsi), %ymm6 +; GATHER-NEXT: vpmovsxbq 1024(%rax,%rsi), %ymm7 +; GATHER-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm7,8), %ymm8 +; GATHER-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm6,8), %ymm7 +; GATHER-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm5,8), %ymm6 +; GATHER-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm4,8), %ymm5 +; GATHER-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm3,8), %ymm4 +; GATHER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm2,8), %ymm3 +; GATHER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm1,8), %ymm2 +; GATHER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; GATHER-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9 +; GATHER-NEXT: vpgatherqq %ymm9, (%rcx,%ymm0,8), %ymm1 +; GATHER-NEXT: vmovdqu %ymm1, 8416(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm2, 8384(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm3, 8352(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm4, 8320(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm5, 8288(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm6, 8256(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm7, 8224(%rdx,%rsi,8) +; GATHER-NEXT: vmovdqu %ymm8, 8192(%rdx,%rsi,8) +; GATHER-NEXT: addq $32, %rsi +; GATHER-NEXT: jne .LBB0_1 +; GATHER-NEXT: # %bb.2: # %middle.block +; GATHER-NEXT: movb $1, %sil +; GATHER-NEXT: testb %sil, %sil +; GATHER-NEXT: jne .LBB0_5 +; GATHER-NEXT: # %bb.3: # %vec.epilog.scalar.ph +; GATHER-NEXT: movl $1024, %esi # imm = 0x400 +; GATHER-NEXT: .p2align 4, 0x90 +; GATHER-NEXT: .LBB0_4: # %for.body +; GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; GATHER-NEXT: movsbq (%rax,%rsi), %rdi +; GATHER-NEXT: movq (%rcx,%rdi,8), %rdi +; GATHER-NEXT: movq %rdi, (%rdx,%rsi,8) +; GATHER-NEXT: incq %rsi +; GATHER-NEXT: cmpq $1024, %rsi # imm = 0x400 +; GATHER-NEXT: jb .LBB0_4 +; GATHER-NEXT: .LBB0_5: # %for.cond.cleanup +; GATHER-NEXT: vzeroupper +; GATHER-NEXT: retq ; ; NO-GATHER-LABEL: test: -; NO-GATHER-NOT: vpgatherdq +; NO-GATHER: # %bb.0: # %iter.check +; NO-GATHER-NEXT: pushq %rbp +; 
NO-GATHER-NEXT: .cfi_def_cfa_offset 16 +; NO-GATHER-NEXT: pushq %r15 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 24 +; NO-GATHER-NEXT: pushq %r14 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 32 +; NO-GATHER-NEXT: pushq %r13 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 40 +; NO-GATHER-NEXT: pushq %r12 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 48 +; NO-GATHER-NEXT: pushq %rbx +; NO-GATHER-NEXT: .cfi_def_cfa_offset 56 +; NO-GATHER-NEXT: .cfi_offset %rbx, -56 +; NO-GATHER-NEXT: .cfi_offset %r12, -48 +; NO-GATHER-NEXT: .cfi_offset %r13, -40 +; NO-GATHER-NEXT: .cfi_offset %r14, -32 +; NO-GATHER-NEXT: .cfi_offset %r15, -24 +; NO-GATHER-NEXT: .cfi_offset %rbp, -16 +; NO-GATHER-NEXT: movq $-1024, %rdx # imm = 0xFC00 +; NO-GATHER-NEXT: movq A@GOTPCREL(%rip), %rdi +; NO-GATHER-NEXT: vpbroadcastq B@GOTPCREL(%rip), %ymm10 +; NO-GATHER-NEXT: movq C@GOTPCREL(%rip), %rcx +; NO-GATHER-NEXT: .p2align 4, 0x90 +; NO-GATHER-NEXT: .LBB0_1: # %vector.body +; NO-GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; NO-GATHER-NEXT: vpmovsxbq 1024(%rdi,%rdx), %ymm5 +; NO-GATHER-NEXT: vpmovsxbq 1028(%rdi,%rdx), %ymm6 +; NO-GATHER-NEXT: vpmovsxbq 1032(%rdi,%rdx), %ymm7 +; NO-GATHER-NEXT: vpmovsxbq 1036(%rdi,%rdx), %ymm8 +; NO-GATHER-NEXT: vpmovsxbq 1040(%rdi,%rdx), %ymm4 +; NO-GATHER-NEXT: vpmovsxbq 1044(%rdi,%rdx), %ymm3 +; NO-GATHER-NEXT: vpmovsxbq 1048(%rdi,%rdx), %ymm2 +; NO-GATHER-NEXT: vpmovsxbq 1052(%rdi,%rdx), %ymm1 +; NO-GATHER-NEXT: vpsllq $3, %ymm1, %ymm1 +; NO-GATHER-NEXT: vpaddq %ymm1, %ymm10, %ymm1 +; NO-GATHER-NEXT: vpsllq $3, %ymm2, %ymm2 +; NO-GATHER-NEXT: vpaddq %ymm2, %ymm10, %ymm2 +; NO-GATHER-NEXT: vpsllq $3, %ymm3, %ymm3 +; NO-GATHER-NEXT: vpaddq %ymm3, %ymm10, %ymm3 +; NO-GATHER-NEXT: vpsllq $3, %ymm4, %ymm4 +; NO-GATHER-NEXT: vpaddq %ymm4, %ymm10, %ymm4 +; NO-GATHER-NEXT: vpsllq $3, %ymm8, %ymm8 +; NO-GATHER-NEXT: vpaddq %ymm8, %ymm10, %ymm8 +; NO-GATHER-NEXT: vpsllq $3, %ymm7, %ymm7 +; NO-GATHER-NEXT: vpaddq %ymm7, %ymm10, %ymm7 +; NO-GATHER-NEXT: vpsllq $3, %ymm6, %ymm6 +; NO-GATHER-NEXT: vpaddq %ymm6, %ymm10, %ymm6 +; NO-GATHER-NEXT: vpsllq $3, %ymm5, %ymm5 +; NO-GATHER-NEXT: vpaddq %ymm5, %ymm10, %ymm5 +; NO-GATHER-NEXT: vmovq %xmm5, %rdi +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %rax +; NO-GATHER-NEXT: vextracti128 $1, %ymm5, %xmm5 +; NO-GATHER-NEXT: vmovq %xmm5, %r13 +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %rsi +; NO-GATHER-NEXT: vmovq %xmm6, %r15 +; NO-GATHER-NEXT: vpextrq $1, %xmm6, %rbp +; NO-GATHER-NEXT: vextracti128 $1, %ymm6, %xmm5 +; NO-GATHER-NEXT: vmovq %xmm5, %rbx +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %r12 +; NO-GATHER-NEXT: vmovq %xmm7, %r10 +; NO-GATHER-NEXT: vpextrq $1, %xmm7, %r14 +; NO-GATHER-NEXT: vextracti128 $1, %ymm7, %xmm5 +; NO-GATHER-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %r9 +; NO-GATHER-NEXT: vmovq %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vpextrq $1, %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vextracti128 $1, %ymm8, %xmm5 +; NO-GATHER-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %rax +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NO-GATHER-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-GATHER-NEXT: vpextrq $1, %xmm4, %r8 +; NO-GATHER-NEXT: vextracti128 $1, %ymm4, %xmm9 +; NO-GATHER-NEXT: vmovsd 
{{.*#+}} xmm5 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm9, %r13 +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm3, %r15 +; NO-GATHER-NEXT: vextracti128 $1, %ymm3, %xmm3 +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm3, %rbx +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm3, %r14 +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm13 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm2, %r11 +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm14 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm2, %r12 +; NO-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm2 +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm2, %r10 +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm2, %rsi +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NO-GATHER-NEXT: vmovdqa %ymm1, %ymm2 +; NO-GATHER-NEXT: vmovq %xmm2, %r9 +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; NO-GATHER-NEXT: vpextrq $1, %xmm2, %rdi +; NO-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm2 +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; NO-GATHER-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; NO-GATHER-NEXT: vmovq %xmm2, %rbp +; NO-GATHER-NEXT: vpextrq $1, %xmm2, %rax +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; NO-GATHER-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; NO-GATHER-NEXT: # xmm2 = xmm5[0],mem[0] +; NO-GATHER-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm4[0] +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm14 = xmm15[0],xmm14[0] +; NO-GATHER-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm9[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm12[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm3 = 
xmm6[0],xmm3[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm7[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; NO-GATHER-NEXT: movq A@GOTPCREL(%rip), %rdi +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm11 = xmm15[0],xmm11[0] +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; NO-GATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] +; NO-GATHER-NEXT: vmovups %xmm2, 8432(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm11, 8416(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm6, 8400(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm7, 8384(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm3, 8368(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm12, 8352(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm9, 8336(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm5, 8320(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm1, 8304(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm0, 8288(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm14, 8272(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm13, 8256(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm4, 8240(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovups %xmm8, 8224(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-GATHER-NEXT: vmovups %xmm0, 8208(%rcx,%rdx,8) +; NO-GATHER-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-GATHER-NEXT: vmovups %xmm0, 8192(%rcx,%rdx,8) +; NO-GATHER-NEXT: addq $32, %rdx +; NO-GATHER-NEXT: jne .LBB0_1 +; NO-GATHER-NEXT: # %bb.2: # %middle.block +; NO-GATHER-NEXT: movb $1, %al +; NO-GATHER-NEXT: testb %al, %al +; NO-GATHER-NEXT: jne .LBB0_5 +; NO-GATHER-NEXT: # %bb.3: # %vec.epilog.scalar.ph +; NO-GATHER-NEXT: movl $1024, %eax # imm = 0x400 +; NO-GATHER-NEXT: movq B@GOTPCREL(%rip), %rdx +; NO-GATHER-NEXT: .p2align 4, 0x90 +; NO-GATHER-NEXT: .LBB0_4: # %for.body +; NO-GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; NO-GATHER-NEXT: movsbq (%rdi,%rax), %rsi +; NO-GATHER-NEXT: movq (%rdx,%rsi,8), %rsi +; NO-GATHER-NEXT: movq %rsi, (%rcx,%rax,8) +; NO-GATHER-NEXT: incq %rax +; NO-GATHER-NEXT: cmpq $1024, %rax # imm = 0x400 +; NO-GATHER-NEXT: jb .LBB0_4 +; NO-GATHER-NEXT: .LBB0_5: # %for.cond.cleanup +; NO-GATHER-NEXT: popq %rbx +; NO-GATHER-NEXT: .cfi_def_cfa_offset 48 +; NO-GATHER-NEXT: popq %r12 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 40 +; NO-GATHER-NEXT: popq %r13 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 32 +; NO-GATHER-NEXT: popq %r14 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 24 +; NO-GATHER-NEXT: popq %r15 +; NO-GATHER-NEXT: .cfi_def_cfa_offset 16 +; NO-GATHER-NEXT: popq %rbp +; NO-GATHER-NEXT: .cfi_def_cfa_offset 8 +; NO-GATHER-NEXT: vzeroupper +; NO-GATHER-NEXT: retq +; +; SCATTER-LABEL: test: +; SCATTER: # %bb.0: # %iter.check +; SCATTER-NEXT: movq $-1024, %rsi # imm = 0xFC00 +; SCATTER-NEXT: movq A@GOTPCREL(%rip), %rax +; SCATTER-NEXT: movq B@GOTPCREL(%rip), %rcx +; SCATTER-NEXT: movq C@GOTPCREL(%rip), %rdx +; SCATTER-NEXT: .p2align 4, 0x90 +; SCATTER-NEXT: .LBB0_1: # %vector.body +; SCATTER-NEXT: # =>This Inner Loop Header: Depth=1 +; SCATTER-NEXT: vpmovsxbq 1048(%rax,%rsi), %zmm0 +; SCATTER-NEXT: vpmovsxbq 1040(%rax,%rsi), %zmm1 +; SCATTER-NEXT: vpmovsxbq 1032(%rax,%rsi), %zmm2 +; SCATTER-NEXT: vpmovsxbq 1024(%rax,%rsi), %zmm3 +; SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; SCATTER-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; SCATTER-NEXT: vpgatherqq (%rcx,%zmm3,8), %zmm4 {%k1} +; SCATTER-NEXT: 
kxnorw %k0, %k0, %k1 +; SCATTER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; SCATTER-NEXT: vpgatherqq (%rcx,%zmm2,8), %zmm3 {%k1} +; SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; SCATTER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SCATTER-NEXT: vpgatherqq (%rcx,%zmm1,8), %zmm2 {%k1} +; SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; SCATTER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SCATTER-NEXT: vpgatherqq (%rcx,%zmm0,8), %zmm1 {%k1} +; SCATTER-NEXT: vmovdqu64 %zmm1, 8384(%rdx,%rsi,8) +; SCATTER-NEXT: vmovdqu64 %zmm2, 8320(%rdx,%rsi,8) +; SCATTER-NEXT: vmovdqu64 %zmm3, 8256(%rdx,%rsi,8) +; SCATTER-NEXT: vmovdqu64 %zmm4, 8192(%rdx,%rsi,8) +; SCATTER-NEXT: addq $32, %rsi +; SCATTER-NEXT: jne .LBB0_1 +; SCATTER-NEXT: # %bb.2: # %middle.block +; SCATTER-NEXT: movb $1, %sil +; SCATTER-NEXT: testb %sil, %sil +; SCATTER-NEXT: jne .LBB0_5 +; SCATTER-NEXT: # %bb.3: # %vec.epilog.scalar.ph +; SCATTER-NEXT: movl $1024, %esi # imm = 0x400 +; SCATTER-NEXT: .p2align 4, 0x90 +; SCATTER-NEXT: .LBB0_4: # %for.body +; SCATTER-NEXT: # =>This Inner Loop Header: Depth=1 +; SCATTER-NEXT: movsbq (%rax,%rsi), %rdi +; SCATTER-NEXT: movq (%rcx,%rdi,8), %rdi +; SCATTER-NEXT: movq %rdi, (%rdx,%rsi,8) +; SCATTER-NEXT: incq %rsi +; SCATTER-NEXT: cmpq $1024, %rsi # imm = 0x400 +; SCATTER-NEXT: jb .LBB0_4 +; SCATTER-NEXT: .LBB0_5: # %for.cond.cleanup +; SCATTER-NEXT: vzeroupper +; SCATTER-NEXT: retq +; +; SCATTER-NO-GATHER-LABEL: test: +; SCATTER-NO-GATHER: # %bb.0: # %iter.check +; SCATTER-NO-GATHER-NEXT: movq $-1024, %rdx # imm = 0xFC00 +; SCATTER-NO-GATHER-NEXT: movq A@GOTPCREL(%rip), %rax +; SCATTER-NO-GATHER-NEXT: vpbroadcastq B@GOTPCREL(%rip), %zmm5 +; SCATTER-NO-GATHER-NEXT: movq C@GOTPCREL(%rip), %rcx +; SCATTER-NO-GATHER-NEXT: .p2align 4, 0x90 +; SCATTER-NO-GATHER-NEXT: .LBB0_1: # %vector.body +; SCATTER-NO-GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; SCATTER-NO-GATHER-NEXT: vpmovsxbq 1024(%rax,%rdx), %zmm3 +; SCATTER-NO-GATHER-NEXT: vpmovsxbq 1032(%rax,%rdx), %zmm4 +; SCATTER-NO-GATHER-NEXT: vpmovsxbq 1040(%rax,%rdx), %zmm2 +; SCATTER-NO-GATHER-NEXT: vpmovsxbq 1048(%rax,%rdx), %zmm1 +; SCATTER-NO-GATHER-NEXT: vpsllq $3, %zmm1, %zmm1 +; SCATTER-NO-GATHER-NEXT: vpaddq %zmm1, %zmm5, %zmm1 +; SCATTER-NO-GATHER-NEXT: vpsllq $3, %zmm2, %zmm2 +; SCATTER-NO-GATHER-NEXT: vpaddq %zmm2, %zmm5, %zmm2 +; SCATTER-NO-GATHER-NEXT: vpsllq $3, %zmm4, %zmm4 +; SCATTER-NO-GATHER-NEXT: vpaddq %zmm4, %zmm5, %zmm6 +; SCATTER-NO-GATHER-NEXT: vpsllq $3, %zmm3, %zmm3 +; SCATTER-NO-GATHER-NEXT: vpaddq %zmm3, %zmm5, %zmm11 +; SCATTER-NO-GATHER-NEXT: vextracti128 $1, %ymm11, %xmm9 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $2, %zmm11, %xmm7 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $3, %zmm11, %xmm4 +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm4, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SCATTER-NO-GATHER-NEXT: vmovq %xmm4, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm7, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm7, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm9, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm9, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm11, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq 
%xmm11, %rsi +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $1, %ymm6, %xmm17 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $2, %zmm6, %xmm15 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $3, %zmm6, %xmm13 +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm13, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm13, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm13 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm15, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm14 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm15, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm17, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm16 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm17, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm17 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm6, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm18 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm6, %rsi +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $1, %ymm2, %xmm19 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $2, %zmm2, %xmm20 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $3, %zmm2, %xmm21 +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm21, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm22 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm21, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm21 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm20, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm23 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm20, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm20 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm19, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm19, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm19 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm2, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm25 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm2, %rsi +; SCATTER-NO-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $2, %zmm1, %xmm26 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $3, %zmm1, %xmm27 +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm28 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm27, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm29 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm27, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm27 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm26, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm30 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm26, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm26 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm2, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm31 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm2, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm1, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vmovq %xmm1, %rsi +; SCATTER-NO-GATHER-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; SCATTER-NO-GATHER-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; SCATTER-NO-GATHER-NEXT: # xmm4 = xmm4[0],mem[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm4, 
%ymm3, %ymm3 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm8[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; SCATTER-NO-GATHER-NEXT: vinsertf64x4 $1, %ymm3, %zmm4, %zmm3 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm4 = xmm13[0],xmm12[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm15[0],xmm14[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm17[0],xmm16[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm18[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; SCATTER-NO-GATHER-NEXT: vinsertf64x4 $1, %ymm4, %zmm6, %zmm4 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm6 = xmm21[0],xmm22[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm20[0],xmm23[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm19[0],xmm24[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm8 = xmm28[0],xmm25[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; SCATTER-NO-GATHER-NEXT: vinsertf64x4 $1, %ymm6, %zmm7, %zmm6 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm27[0],xmm29[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm8 = xmm26[0],xmm30[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm31[0] +; SCATTER-NO-GATHER-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; SCATTER-NO-GATHER-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; SCATTER-NO-GATHER-NEXT: vinsertf64x4 $1, %ymm7, %zmm0, %zmm0 +; SCATTER-NO-GATHER-NEXT: vmovups %zmm0, 8384(%rcx,%rdx,8) +; SCATTER-NO-GATHER-NEXT: vmovups %zmm6, 8320(%rcx,%rdx,8) +; SCATTER-NO-GATHER-NEXT: vmovups %zmm4, 8256(%rcx,%rdx,8) +; SCATTER-NO-GATHER-NEXT: vmovups %zmm3, 8192(%rcx,%rdx,8) +; SCATTER-NO-GATHER-NEXT: addq $32, %rdx +; SCATTER-NO-GATHER-NEXT: jne .LBB0_1 +; SCATTER-NO-GATHER-NEXT: # %bb.2: # %middle.block +; SCATTER-NO-GATHER-NEXT: movb $1, %dl +; SCATTER-NO-GATHER-NEXT: testb %dl, %dl +; SCATTER-NO-GATHER-NEXT: jne .LBB0_5 +; SCATTER-NO-GATHER-NEXT: # %bb.3: # %vec.epilog.scalar.ph +; SCATTER-NO-GATHER-NEXT: movl $1024, %edx # imm = 0x400 +; SCATTER-NO-GATHER-NEXT: movq B@GOTPCREL(%rip), %rsi +; SCATTER-NO-GATHER-NEXT: .p2align 4, 0x90 +; SCATTER-NO-GATHER-NEXT: .LBB0_4: # %for.body +; SCATTER-NO-GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; SCATTER-NO-GATHER-NEXT: movsbq (%rax,%rdx), %rdi +; SCATTER-NO-GATHER-NEXT: movq (%rsi,%rdi,8), %rdi +; SCATTER-NO-GATHER-NEXT: movq %rdi, (%rcx,%rdx,8) +; SCATTER-NO-GATHER-NEXT: incq %rdx +; SCATTER-NO-GATHER-NEXT: cmpq $1024, %rdx # imm = 0x400 +; SCATTER-NO-GATHER-NEXT: jb .LBB0_4 +; SCATTER-NO-GATHER-NEXT: .LBB0_5: # %for.cond.cleanup +; SCATTER-NO-GATHER-NEXT: vzeroupper +; SCATTER-NO-GATHER-NEXT: retq ; ; GATHER-NO-SCATTER-LABEL: test: -; GATHER-NO-SCATTER: vpgatherdq +; GATHER-NO-SCATTER: # %bb.0: # %iter.check +; GATHER-NO-SCATTER-NEXT: movq $-1024, %rsi # imm = 0xFC00 +; GATHER-NO-SCATTER-NEXT: movq A@GOTPCREL(%rip), %rax +; GATHER-NO-SCATTER-NEXT: movq B@GOTPCREL(%rip), %rcx +; GATHER-NO-SCATTER-NEXT: movq C@GOTPCREL(%rip), %rdx +; GATHER-NO-SCATTER-NEXT: .p2align 4, 0x90 +; GATHER-NO-SCATTER-NEXT: .LBB0_1: # %vector.body +; GATHER-NO-SCATTER-NEXT: # =>This Inner Loop Header: Depth=1 +; GATHER-NO-SCATTER-NEXT: vpmovsxbq 1048(%rax,%rsi), %zmm0 +; GATHER-NO-SCATTER-NEXT: vpmovsxbq 1040(%rax,%rsi), %zmm1 +; 
GATHER-NO-SCATTER-NEXT: vpmovsxbq 1032(%rax,%rsi), %zmm2 +; GATHER-NO-SCATTER-NEXT: vpmovsxbq 1024(%rax,%rsi), %zmm3 +; GATHER-NO-SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; GATHER-NO-SCATTER-NEXT: vpgatherqq (%rcx,%zmm3,8), %zmm4 {%k1} +; GATHER-NO-SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; GATHER-NO-SCATTER-NEXT: vpgatherqq (%rcx,%zmm2,8), %zmm3 {%k1} +; GATHER-NO-SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; GATHER-NO-SCATTER-NEXT: vpgatherqq (%rcx,%zmm1,8), %zmm2 {%k1} +; GATHER-NO-SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; GATHER-NO-SCATTER-NEXT: vpgatherqq (%rcx,%zmm0,8), %zmm1 {%k1} +; GATHER-NO-SCATTER-NEXT: vmovdqu64 %zmm1, 8384(%rdx,%rsi,8) +; GATHER-NO-SCATTER-NEXT: vmovdqu64 %zmm2, 8320(%rdx,%rsi,8) +; GATHER-NO-SCATTER-NEXT: vmovdqu64 %zmm3, 8256(%rdx,%rsi,8) +; GATHER-NO-SCATTER-NEXT: vmovdqu64 %zmm4, 8192(%rdx,%rsi,8) +; GATHER-NO-SCATTER-NEXT: addq $32, %rsi +; GATHER-NO-SCATTER-NEXT: jne .LBB0_1 +; GATHER-NO-SCATTER-NEXT: # %bb.2: # %middle.block +; GATHER-NO-SCATTER-NEXT: movb $1, %sil +; GATHER-NO-SCATTER-NEXT: testb %sil, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB0_5 +; GATHER-NO-SCATTER-NEXT: # %bb.3: # %vec.epilog.scalar.ph +; GATHER-NO-SCATTER-NEXT: movl $1024, %esi # imm = 0x400 +; GATHER-NO-SCATTER-NEXT: .p2align 4, 0x90 +; GATHER-NO-SCATTER-NEXT: .LBB0_4: # %for.body +; GATHER-NO-SCATTER-NEXT: # =>This Inner Loop Header: Depth=1 +; GATHER-NO-SCATTER-NEXT: movsbq (%rax,%rsi), %rdi +; GATHER-NO-SCATTER-NEXT: movq (%rcx,%rdi,8), %rdi +; GATHER-NO-SCATTER-NEXT: movq %rdi, (%rdx,%rsi,8) +; GATHER-NO-SCATTER-NEXT: incq %rsi +; GATHER-NO-SCATTER-NEXT: cmpq $1024, %rsi # imm = 0x400 +; GATHER-NO-SCATTER-NEXT: jb .LBB0_4 +; GATHER-NO-SCATTER-NEXT: .LBB0_5: # %for.cond.cleanup +; GATHER-NO-SCATTER-NEXT: vzeroupper +; GATHER-NO-SCATTER-NEXT: retq ; ; NO-SCATTER-GATHER-LABEL: test: -; NO-SCATTER-GATHER-NOT: vpgatherdq +; NO-SCATTER-GATHER: # %bb.0: # %iter.check +; NO-SCATTER-GATHER-NEXT: movq $-1024, %rdx # imm = 0xFC00 +; NO-SCATTER-GATHER-NEXT: movq A@GOTPCREL(%rip), %rax +; NO-SCATTER-GATHER-NEXT: vpbroadcastq B@GOTPCREL(%rip), %zmm5 +; NO-SCATTER-GATHER-NEXT: movq C@GOTPCREL(%rip), %rcx +; NO-SCATTER-GATHER-NEXT: .p2align 4, 0x90 +; NO-SCATTER-GATHER-NEXT: .LBB0_1: # %vector.body +; NO-SCATTER-GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; NO-SCATTER-GATHER-NEXT: vpmovsxbq 1024(%rax,%rdx), %zmm3 +; NO-SCATTER-GATHER-NEXT: vpmovsxbq 1032(%rax,%rdx), %zmm4 +; NO-SCATTER-GATHER-NEXT: vpmovsxbq 1040(%rax,%rdx), %zmm2 +; NO-SCATTER-GATHER-NEXT: vpmovsxbq 1048(%rax,%rdx), %zmm1 +; NO-SCATTER-GATHER-NEXT: vpsllq $3, %zmm1, %zmm1 +; NO-SCATTER-GATHER-NEXT: vpaddq %zmm1, %zmm5, %zmm1 +; NO-SCATTER-GATHER-NEXT: vpsllq $3, %zmm2, %zmm2 +; NO-SCATTER-GATHER-NEXT: vpaddq %zmm2, %zmm5, %zmm2 +; NO-SCATTER-GATHER-NEXT: vpsllq $3, %zmm4, %zmm4 +; NO-SCATTER-GATHER-NEXT: vpaddq %zmm4, %zmm5, %zmm6 +; NO-SCATTER-GATHER-NEXT: vpsllq $3, %zmm3, %zmm3 +; NO-SCATTER-GATHER-NEXT: vpaddq %zmm3, %zmm5, %zmm11 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm11, %xmm9 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm11, %xmm7 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm11, %xmm4 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm4, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-SCATTER-GATHER-NEXT: vmovq 
%xmm4, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm7, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm7, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm9, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm9, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm11, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm11, %rsi +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $1, %ymm6, %xmm17 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm6, %xmm15 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm6, %xmm13 +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm13, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm13, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm13 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm15, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm14 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm15, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm17, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm16 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm17, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm17 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm6, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm18 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm6, %rsi +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $1, %ymm2, %xmm19 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm2, %xmm20 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm2, %xmm21 +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm21, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm22 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm21, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm21 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm20, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm23 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm20, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm20 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm19, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm19, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm19 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm25 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm2, %rsi +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm1, %xmm26 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm1, %xmm27 +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm28 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm27, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm29 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm27, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm27 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm26, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm30 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm26, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm26 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %rsi 
+; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm31 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm2, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm1, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vmovq %xmm1, %rsi +; NO-SCATTER-GATHER-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; NO-SCATTER-GATHER-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; NO-SCATTER-GATHER-NEXT: # xmm4 = xmm4[0],mem[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm8[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; NO-SCATTER-GATHER-NEXT: vinsertf64x4 $1, %ymm3, %zmm4, %zmm3 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm4 = xmm13[0],xmm12[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm15[0],xmm14[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm17[0],xmm16[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm18[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; NO-SCATTER-GATHER-NEXT: vinsertf64x4 $1, %ymm4, %zmm6, %zmm4 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm6 = xmm21[0],xmm22[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm20[0],xmm23[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm19[0],xmm24[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm8 = xmm28[0],xmm25[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; NO-SCATTER-GATHER-NEXT: vinsertf64x4 $1, %ymm6, %zmm7, %zmm6 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm7 = xmm27[0],xmm29[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm8 = xmm26[0],xmm30[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm31[0] +; NO-SCATTER-GATHER-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; NO-SCATTER-GATHER-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; NO-SCATTER-GATHER-NEXT: vinsertf64x4 $1, %ymm7, %zmm0, %zmm0 +; NO-SCATTER-GATHER-NEXT: vmovups %zmm0, 8384(%rcx,%rdx,8) +; NO-SCATTER-GATHER-NEXT: vmovups %zmm6, 8320(%rcx,%rdx,8) +; NO-SCATTER-GATHER-NEXT: vmovups %zmm4, 8256(%rcx,%rdx,8) +; NO-SCATTER-GATHER-NEXT: vmovups %zmm3, 8192(%rcx,%rdx,8) +; NO-SCATTER-GATHER-NEXT: addq $32, %rdx +; NO-SCATTER-GATHER-NEXT: jne .LBB0_1 +; NO-SCATTER-GATHER-NEXT: # %bb.2: # %middle.block +; NO-SCATTER-GATHER-NEXT: movb $1, %dl +; NO-SCATTER-GATHER-NEXT: testb %dl, %dl +; NO-SCATTER-GATHER-NEXT: jne .LBB0_5 +; NO-SCATTER-GATHER-NEXT: # %bb.3: # %vec.epilog.scalar.ph +; NO-SCATTER-GATHER-NEXT: movl $1024, %edx # imm = 0x400 +; NO-SCATTER-GATHER-NEXT: movq B@GOTPCREL(%rip), %rsi +; NO-SCATTER-GATHER-NEXT: .p2align 4, 0x90 +; NO-SCATTER-GATHER-NEXT: .LBB0_4: # %for.body +; NO-SCATTER-GATHER-NEXT: # =>This Inner Loop Header: Depth=1 +; NO-SCATTER-GATHER-NEXT: movsbq (%rax,%rdx), %rdi +; NO-SCATTER-GATHER-NEXT: movq (%rsi,%rdi,8), %rdi +; NO-SCATTER-GATHER-NEXT: movq %rdi, (%rcx,%rdx,8) +; NO-SCATTER-GATHER-NEXT: incq %rdx +; NO-SCATTER-GATHER-NEXT: cmpq $1024, %rdx # imm = 0x400 +; NO-SCATTER-GATHER-NEXT: jb .LBB0_4 +; NO-SCATTER-GATHER-NEXT: .LBB0_5: # %for.cond.cleanup +; NO-SCATTER-GATHER-NEXT: vzeroupper +; 
NO-SCATTER-GATHER-NEXT: retq iter.check: br i1 false, label %vec.epilog.scalar.ph, label %vector.main.loop.iter.check @@ -112,16 +781,147 @@ ; This tests the function that if prefer-no-gather can disable ScalarizeMaskedGather define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <4 x float> %passthru) { ; GATHER-LABEL: gather_v4f32_ptr_v4i32: -; GATHER: vgatherqps +; GATHER: # %bb.0: +; GATHER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; GATHER-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; GATHER-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2 +; GATHER-NEXT: vmovaps %xmm2, %xmm0 +; GATHER-NEXT: vzeroupper +; GATHER-NEXT: retq ; ; NO-GATHER-LABEL: gather_v4f32_ptr_v4i32: -; NO-GATHER-NOT: vgatherqps +; NO-GATHER: # %bb.0: +; NO-GATHER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; NO-GATHER-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; NO-GATHER-NEXT: vmovmskps %xmm1, %eax +; NO-GATHER-NEXT: testb $1, %al +; NO-GATHER-NEXT: je .LBB1_2 +; NO-GATHER-NEXT: # %bb.1: # %cond.load +; NO-GATHER-NEXT: vmovq %xmm0, %rcx +; NO-GATHER-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-GATHER-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; NO-GATHER-NEXT: .LBB1_2: # %else +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: je .LBB1_4 +; NO-GATHER-NEXT: # %bb.3: # %cond.load1 +; NO-GATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NO-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; NO-GATHER-NEXT: .LBB1_4: # %else2 +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NO-GATHER-NEXT: jne .LBB1_5 +; NO-GATHER-NEXT: # %bb.6: # %else5 +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: jne .LBB1_7 +; NO-GATHER-NEXT: .LBB1_8: # %else8 +; NO-GATHER-NEXT: vmovaps %xmm2, %xmm0 +; NO-GATHER-NEXT: vzeroupper +; NO-GATHER-NEXT: retq +; NO-GATHER-NEXT: .LBB1_5: # %cond.load4 +; NO-GATHER-NEXT: vmovq %xmm0, %rcx +; NO-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: je .LBB1_8 +; NO-GATHER-NEXT: .LBB1_7: # %cond.load7 +; NO-GATHER-NEXT: vpextrq $1, %xmm0, %rax +; NO-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; NO-GATHER-NEXT: vmovaps %xmm2, %xmm0 +; NO-GATHER-NEXT: vzeroupper +; NO-GATHER-NEXT: retq +; +; SCATTER-LABEL: gather_v4f32_ptr_v4i32: +; SCATTER: # %bb.0: +; SCATTER-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; SCATTER-NEXT: vgatherqps (,%ymm0), %xmm2 {%k1} +; SCATTER-NEXT: vmovaps %xmm2, %xmm0 +; SCATTER-NEXT: vzeroupper +; SCATTER-NEXT: retq +; +; SCATTER-NO-GATHER-LABEL: gather_v4f32_ptr_v4i32: +; SCATTER-NO-GATHER: # %bb.0: +; SCATTER-NO-GATHER-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; SCATTER-NO-GATHER-NEXT: kmovw %k0, %eax +; SCATTER-NO-GATHER-NEXT: testb $1, %al +; SCATTER-NO-GATHER-NEXT: je .LBB1_2 +; SCATTER-NO-GATHER-NEXT: # %bb.1: # %cond.load +; SCATTER-NO-GATHER-NEXT: vmovq %xmm0, %rcx +; SCATTER-NO-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SCATTER-NO-GATHER-NEXT: .LBB1_2: # %else +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: je .LBB1_4 +; SCATTER-NO-GATHER-NEXT: # %bb.3: # %cond.load1 +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm0, %rcx +; SCATTER-NO-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; SCATTER-NO-GATHER-NEXT: .LBB1_4: # %else2 +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; SCATTER-NO-GATHER-NEXT: jne .LBB1_5 +; SCATTER-NO-GATHER-NEXT: # %bb.6: # %else5 +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; 
SCATTER-NO-GATHER-NEXT: jne .LBB1_7 +; SCATTER-NO-GATHER-NEXT: .LBB1_8: # %else8 +; SCATTER-NO-GATHER-NEXT: vmovaps %xmm2, %xmm0 +; SCATTER-NO-GATHER-NEXT: vzeroupper +; SCATTER-NO-GATHER-NEXT: retq +; SCATTER-NO-GATHER-NEXT: .LBB1_5: # %cond.load4 +; SCATTER-NO-GATHER-NEXT: vmovq %xmm0, %rcx +; SCATTER-NO-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: je .LBB1_8 +; SCATTER-NO-GATHER-NEXT: .LBB1_7: # %cond.load7 +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm0, %rax +; SCATTER-NO-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; SCATTER-NO-GATHER-NEXT: vmovaps %xmm2, %xmm0 +; SCATTER-NO-GATHER-NEXT: vzeroupper +; SCATTER-NO-GATHER-NEXT: retq ; ; GATHER-NO-SCATTER-LABEL: gather_v4f32_ptr_v4i32: -; GATHER-NO-SCATTER: vgatherqps +; GATHER-NO-SCATTER: # %bb.0: +; GATHER-NO-SCATTER-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; GATHER-NO-SCATTER-NEXT: vgatherqps (,%ymm0), %xmm2 {%k1} +; GATHER-NO-SCATTER-NEXT: vmovaps %xmm2, %xmm0 +; GATHER-NO-SCATTER-NEXT: vzeroupper +; GATHER-NO-SCATTER-NEXT: retq ; ; NO-SCATTER-GATHER-LABEL: gather_v4f32_ptr_v4i32: -; NO-SCATTER-GATHER-NOT: vgatherqps +; NO-SCATTER-GATHER: # %bb.0: +; NO-SCATTER-GATHER-NEXT: vptestnmd %xmm1, %xmm1, %k0 +; NO-SCATTER-GATHER-NEXT: kmovw %k0, %eax +; NO-SCATTER-GATHER-NEXT: testb $1, %al +; NO-SCATTER-GATHER-NEXT: je .LBB1_2 +; NO-SCATTER-GATHER-NEXT: # %bb.1: # %cond.load +; NO-SCATTER-GATHER-NEXT: vmovq %xmm0, %rcx +; NO-SCATTER-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; NO-SCATTER-GATHER-NEXT: .LBB1_2: # %else +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: je .LBB1_4 +; NO-SCATTER-GATHER-NEXT: # %bb.3: # %cond.load1 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm0, %rcx +; NO-SCATTER-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] +; NO-SCATTER-GATHER-NEXT: .LBB1_4: # %else2 +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NO-SCATTER-GATHER-NEXT: jne .LBB1_5 +; NO-SCATTER-GATHER-NEXT: # %bb.6: # %else5 +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB1_7 +; NO-SCATTER-GATHER-NEXT: .LBB1_8: # %else8 +; NO-SCATTER-GATHER-NEXT: vmovaps %xmm2, %xmm0 +; NO-SCATTER-GATHER-NEXT: vzeroupper +; NO-SCATTER-GATHER-NEXT: retq +; NO-SCATTER-GATHER-NEXT: .LBB1_5: # %cond.load4 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm0, %rcx +; NO-SCATTER-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: je .LBB1_8 +; NO-SCATTER-GATHER-NEXT: .LBB1_7: # %cond.load7 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm0, %rax +; NO-SCATTER-GATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] +; NO-SCATTER-GATHER-NEXT: vmovaps %xmm2, %xmm0 +; NO-SCATTER-GATHER-NEXT: vzeroupper +; NO-SCATTER-GATHER-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %ptr, i32 4, <4 x i1> %mask, <4 x float> %passthru) ret <4 x float> %res @@ -136,13 +936,639 @@ ; This tests the function that if prefer-no-gather can disable ScalarizeMaskedGather define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) { ; GATHER-LABEL: gather_v8i32_v8i32: -; GATHER: vpgatherdd +; GATHER: # %bb.0: +; GATHER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; GATHER-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; GATHER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; GATHER-NEXT: vmovdqa %ymm0, %ymm3 +; 
GATHER-NEXT: vpgatherdd %ymm3, c+12(,%ymm1), %ymm2 +; GATHER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; GATHER-NEXT: vpgatherdd %ymm0, c+28(,%ymm1), %ymm3 +; GATHER-NEXT: vpaddd %ymm3, %ymm2, %ymm0 +; GATHER-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; GATHER-NEXT: retq ; ; NO-GATHER-LABEL: gather_v8i32_v8i32: -; NO-GATHER-NOT: vpgatherdd +; NO-GATHER: # %bb.0: +; NO-GATHER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; NO-GATHER-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm1 +; NO-GATHER-NEXT: vmovmskps %ymm1, %eax +; NO-GATHER-NEXT: testb $1, %al +; NO-GATHER-NEXT: # implicit-def: $ymm1 +; NO-GATHER-NEXT: jne .LBB2_1 +; NO-GATHER-NEXT: # %bb.2: # %else +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: jne .LBB2_3 +; NO-GATHER-NEXT: .LBB2_4: # %else2 +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: jne .LBB2_5 +; NO-GATHER-NEXT: .LBB2_6: # %else5 +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: jne .LBB2_7 +; NO-GATHER-NEXT: .LBB2_8: # %else8 +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: jne .LBB2_9 +; NO-GATHER-NEXT: .LBB2_10: # %else11 +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: jne .LBB2_11 +; NO-GATHER-NEXT: .LBB2_12: # %else14 +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: jne .LBB2_13 +; NO-GATHER-NEXT: .LBB2_14: # %else17 +; NO-GATHER-NEXT: testb $-128, %al +; NO-GATHER-NEXT: je .LBB2_16 +; NO-GATHER-NEXT: .LBB2_15: # %cond.load19 +; NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; NO-GATHER-NEXT: .LBB2_16: # %else20 +; NO-GATHER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; NO-GATHER-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2 +; NO-GATHER-NEXT: vmovmskps %ymm2, %eax +; NO-GATHER-NEXT: testb $1, %al +; NO-GATHER-NEXT: # implicit-def: $ymm2 +; NO-GATHER-NEXT: jne .LBB2_17 +; NO-GATHER-NEXT: # %bb.18: # %else26 +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: jne .LBB2_19 +; NO-GATHER-NEXT: .LBB2_20: # %else31 +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: jne .LBB2_21 +; NO-GATHER-NEXT: .LBB2_22: # %else36 +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: jne .LBB2_23 +; NO-GATHER-NEXT: .LBB2_24: # %else41 +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: jne .LBB2_25 +; NO-GATHER-NEXT: .LBB2_26: # %else46 +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: jne .LBB2_27 +; NO-GATHER-NEXT: .LBB2_28: # %else51 +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: jne .LBB2_29 +; NO-GATHER-NEXT: .LBB2_30: # %else56 +; NO-GATHER-NEXT: testb $-128, %al +; NO-GATHER-NEXT: je .LBB2_32 +; NO-GATHER-NEXT: .LBB2_31: # %cond.load58 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; NO-GATHER-NEXT: .LBB2_32: # %else61 +; NO-GATHER-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; NO-GATHER-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0 +; NO-GATHER-NEXT: vmovmskps %ymm0, %eax +; NO-GATHER-NEXT: testb $1, %al +; NO-GATHER-NEXT: # implicit-def: $ymm0 +; NO-GATHER-NEXT: jne .LBB2_33 +; NO-GATHER-NEXT: # %bb.34: # %else67 +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: jne .LBB2_35 +; NO-GATHER-NEXT: .LBB2_36: # %else72 +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: jne .LBB2_37 +; NO-GATHER-NEXT: .LBB2_38: # %else77 +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: jne .LBB2_39 +; NO-GATHER-NEXT: .LBB2_40: # %else82 +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: jne .LBB2_41 +; NO-GATHER-NEXT: .LBB2_42: # %else87 +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: jne .LBB2_43 +; NO-GATHER-NEXT: .LBB2_44: # %else92 +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: je .LBB2_46 
+; NO-GATHER-NEXT: .LBB2_45: # %cond.load94 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7] +; NO-GATHER-NEXT: .LBB2_46: # %else97 +; NO-GATHER-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; NO-GATHER-NEXT: testb $-128, %al +; NO-GATHER-NEXT: je .LBB2_48 +; NO-GATHER-NEXT: # %bb.47: # %cond.load99 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; NO-GATHER-NEXT: .LBB2_48: # %else102 +; NO-GATHER-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; NO-GATHER-NEXT: retq +; NO-GATHER-NEXT: .LBB2_1: # %cond.load +; NO-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: je .LBB2_4 +; NO-GATHER-NEXT: .LBB2_3: # %cond.load1 +; NO-GATHER-NEXT: vpinsrd $1, c+12(%rip), %xmm1, %xmm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: je .LBB2_6 +; NO-GATHER-NEXT: .LBB2_5: # %cond.load4 +; NO-GATHER-NEXT: vpinsrd $2, c+12(%rip), %xmm1, %xmm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: je .LBB2_8 +; NO-GATHER-NEXT: .LBB2_7: # %cond.load7 +; NO-GATHER-NEXT: vpinsrd $3, c+12(%rip), %xmm1, %xmm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: je .LBB2_10 +; NO-GATHER-NEXT: .LBB2_9: # %cond.load10 +; NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: je .LBB2_12 +; NO-GATHER-NEXT: .LBB2_11: # %cond.load13 +; NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: je .LBB2_14 +; NO-GATHER-NEXT: .LBB2_13: # %cond.load16 +; NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm2 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; NO-GATHER-NEXT: testb $-128, %al +; NO-GATHER-NEXT: jne .LBB2_15 +; NO-GATHER-NEXT: jmp .LBB2_16 +; NO-GATHER-NEXT: .LBB2_17: # %cond.load23 +; NO-GATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: je .LBB2_20 +; NO-GATHER-NEXT: .LBB2_19: # %cond.load28 +; NO-GATHER-NEXT: vpinsrd $1, c+28(%rip), %xmm2, %xmm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: je .LBB2_22 +; NO-GATHER-NEXT: .LBB2_21: # %cond.load33 +; NO-GATHER-NEXT: vpinsrd $2, c+28(%rip), %xmm2, %xmm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: je .LBB2_24 +; NO-GATHER-NEXT: .LBB2_23: # %cond.load38 +; NO-GATHER-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: je .LBB2_26 +; NO-GATHER-NEXT: .LBB2_25: # %cond.load43 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: je .LBB2_28 +; NO-GATHER-NEXT: .LBB2_27: # %cond.load48 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: 
je .LBB2_30 +; NO-GATHER-NEXT: .LBB2_29: # %cond.load53 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; NO-GATHER-NEXT: testb $-128, %al +; NO-GATHER-NEXT: jne .LBB2_31 +; NO-GATHER-NEXT: jmp .LBB2_32 +; NO-GATHER-NEXT: .LBB2_33: # %cond.load64 +; NO-GATHER-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: je .LBB2_36 +; NO-GATHER-NEXT: .LBB2_35: # %cond.load69 +; NO-GATHER-NEXT: vpinsrd $1, c+28(%rip), %xmm0, %xmm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: je .LBB2_38 +; NO-GATHER-NEXT: .LBB2_37: # %cond.load74 +; NO-GATHER-NEXT: vpinsrd $2, c+28(%rip), %xmm0, %xmm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: je .LBB2_40 +; NO-GATHER-NEXT: .LBB2_39: # %cond.load79 +; NO-GATHER-NEXT: vpinsrd $3, c+28(%rip), %xmm0, %xmm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: je .LBB2_42 +; NO-GATHER-NEXT: .LBB2_41: # %cond.load84 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: je .LBB2_44 +; NO-GATHER-NEXT: .LBB2_43: # %cond.load89 +; NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: jne .LBB2_45 +; NO-GATHER-NEXT: jmp .LBB2_46 +; +; SCATTER-LABEL: gather_v8i32_v8i32: +; SCATTER: # %bb.0: +; SCATTER-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; SCATTER-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; SCATTER-NEXT: kmovw %k1, %k2 +; SCATTER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SCATTER-NEXT: vpgatherdd c+12(,%ymm0), %ymm1 {%k2} +; SCATTER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SCATTER-NEXT: vpgatherdd c+28(,%ymm0), %ymm2 {%k1} +; SCATTER-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; SCATTER-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; SCATTER-NEXT: retq +; +; SCATTER-NO-GATHER-LABEL: gather_v8i32_v8i32: +; SCATTER-NO-GATHER: # %bb.0: +; SCATTER-NO-GATHER-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; SCATTER-NO-GATHER-NEXT: kmovw %k0, %eax +; SCATTER-NO-GATHER-NEXT: testb $1, %al +; SCATTER-NO-GATHER-NEXT: # implicit-def: $ymm0 +; SCATTER-NO-GATHER-NEXT: jne .LBB2_1 +; SCATTER-NO-GATHER-NEXT: # %bb.2: # %else +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_3 +; SCATTER-NO-GATHER-NEXT: .LBB2_4: # %else2 +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_5 +; SCATTER-NO-GATHER-NEXT: .LBB2_6: # %else5 +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_7 +; SCATTER-NO-GATHER-NEXT: .LBB2_8: # %else8 +; SCATTER-NO-GATHER-NEXT: testb $16, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_9 +; SCATTER-NO-GATHER-NEXT: .LBB2_10: # %else11 +; SCATTER-NO-GATHER-NEXT: testb $32, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_11 +; SCATTER-NO-GATHER-NEXT: .LBB2_12: # %else14 +; SCATTER-NO-GATHER-NEXT: testb $64, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_13 +; SCATTER-NO-GATHER-NEXT: .LBB2_14: # %else17 +; SCATTER-NO-GATHER-NEXT: testb $-128, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_16 +; SCATTER-NO-GATHER-NEXT: .LBB2_15: # %cond.load19 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; 
SCATTER-NO-GATHER-NEXT: .LBB2_16: # %else20 +; SCATTER-NO-GATHER-NEXT: kmovw %k0, %eax +; SCATTER-NO-GATHER-NEXT: testb $1, %al +; SCATTER-NO-GATHER-NEXT: # implicit-def: $ymm1 +; SCATTER-NO-GATHER-NEXT: jne .LBB2_17 +; SCATTER-NO-GATHER-NEXT: # %bb.18: # %else26 +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_19 +; SCATTER-NO-GATHER-NEXT: .LBB2_20: # %else31 +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_21 +; SCATTER-NO-GATHER-NEXT: .LBB2_22: # %else36 +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_23 +; SCATTER-NO-GATHER-NEXT: .LBB2_24: # %else41 +; SCATTER-NO-GATHER-NEXT: testb $16, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_25 +; SCATTER-NO-GATHER-NEXT: .LBB2_26: # %else46 +; SCATTER-NO-GATHER-NEXT: testb $32, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_27 +; SCATTER-NO-GATHER-NEXT: .LBB2_28: # %else51 +; SCATTER-NO-GATHER-NEXT: testb $64, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_29 +; SCATTER-NO-GATHER-NEXT: .LBB2_30: # %else56 +; SCATTER-NO-GATHER-NEXT: testb $-128, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_32 +; SCATTER-NO-GATHER-NEXT: .LBB2_31: # %cond.load58 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; SCATTER-NO-GATHER-NEXT: .LBB2_32: # %else61 +; SCATTER-NO-GATHER-NEXT: kmovw %k0, %eax +; SCATTER-NO-GATHER-NEXT: testb $1, %al +; SCATTER-NO-GATHER-NEXT: # implicit-def: $ymm2 +; SCATTER-NO-GATHER-NEXT: jne .LBB2_33 +; SCATTER-NO-GATHER-NEXT: # %bb.34: # %else67 +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_35 +; SCATTER-NO-GATHER-NEXT: .LBB2_36: # %else72 +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_37 +; SCATTER-NO-GATHER-NEXT: .LBB2_38: # %else77 +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_39 +; SCATTER-NO-GATHER-NEXT: .LBB2_40: # %else82 +; SCATTER-NO-GATHER-NEXT: testb $16, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_41 +; SCATTER-NO-GATHER-NEXT: .LBB2_42: # %else87 +; SCATTER-NO-GATHER-NEXT: testb $32, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_43 +; SCATTER-NO-GATHER-NEXT: .LBB2_44: # %else92 +; SCATTER-NO-GATHER-NEXT: testb $64, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_46 +; SCATTER-NO-GATHER-NEXT: .LBB2_45: # %cond.load94 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; SCATTER-NO-GATHER-NEXT: .LBB2_46: # %else97 +; SCATTER-NO-GATHER-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; SCATTER-NO-GATHER-NEXT: testb $-128, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_48 +; SCATTER-NO-GATHER-NEXT: # %bb.47: # %cond.load99 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; SCATTER-NO-GATHER-NEXT: .LBB2_48: # %else102 +; SCATTER-NO-GATHER-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; SCATTER-NO-GATHER-NEXT: retq +; SCATTER-NO-GATHER-NEXT: .LBB2_1: # %cond.load +; SCATTER-NO-GATHER-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_4 +; SCATTER-NO-GATHER-NEXT: .LBB2_3: # %cond.load1 +; SCATTER-NO-GATHER-NEXT: vpinsrd $1, c+12(%rip), %xmm0, %xmm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_6 +; SCATTER-NO-GATHER-NEXT: .LBB2_5: # %cond.load4 +; SCATTER-NO-GATHER-NEXT: 
vpinsrd $2, c+12(%rip), %xmm0, %xmm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_8 +; SCATTER-NO-GATHER-NEXT: .LBB2_7: # %cond.load7 +; SCATTER-NO-GATHER-NEXT: vpinsrd $3, c+12(%rip), %xmm0, %xmm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $16, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_10 +; SCATTER-NO-GATHER-NEXT: .LBB2_9: # %cond.load10 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $32, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_12 +; SCATTER-NO-GATHER-NEXT: .LBB2_11: # %cond.load13 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; SCATTER-NO-GATHER-NEXT: testb $64, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_14 +; SCATTER-NO-GATHER-NEXT: .LBB2_13: # %cond.load16 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; SCATTER-NO-GATHER-NEXT: testb $-128, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_15 +; SCATTER-NO-GATHER-NEXT: jmp .LBB2_16 +; SCATTER-NO-GATHER-NEXT: .LBB2_17: # %cond.load23 +; SCATTER-NO-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_20 +; SCATTER-NO-GATHER-NEXT: .LBB2_19: # %cond.load28 +; SCATTER-NO-GATHER-NEXT: vpinsrd $1, c+28(%rip), %xmm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_22 +; SCATTER-NO-GATHER-NEXT: .LBB2_21: # %cond.load33 +; SCATTER-NO-GATHER-NEXT: vpinsrd $2, c+28(%rip), %xmm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_24 +; SCATTER-NO-GATHER-NEXT: .LBB2_23: # %cond.load38 +; SCATTER-NO-GATHER-NEXT: vpinsrd $3, c+28(%rip), %xmm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $16, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_26 +; SCATTER-NO-GATHER-NEXT: .LBB2_25: # %cond.load43 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $32, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_28 +; SCATTER-NO-GATHER-NEXT: .LBB2_27: # %cond.load48 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; SCATTER-NO-GATHER-NEXT: testb $64, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_30 +; SCATTER-NO-GATHER-NEXT: .LBB2_29: # %cond.load53 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; SCATTER-NO-GATHER-NEXT: testb $-128, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_31 +; SCATTER-NO-GATHER-NEXT: jmp .LBB2_32 +; SCATTER-NO-GATHER-NEXT: .LBB2_33: # %cond.load64 +; SCATTER-NO-GATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SCATTER-NO-GATHER-NEXT: testb $2, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_36 +; SCATTER-NO-GATHER-NEXT: .LBB2_35: # %cond.load69 +; SCATTER-NO-GATHER-NEXT: vpinsrd $1, 
c+28(%rip), %xmm2, %xmm3 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $4, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_38 +; SCATTER-NO-GATHER-NEXT: .LBB2_37: # %cond.load74 +; SCATTER-NO-GATHER-NEXT: vpinsrd $2, c+28(%rip), %xmm2, %xmm3 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $8, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_40 +; SCATTER-NO-GATHER-NEXT: .LBB2_39: # %cond.load79 +; SCATTER-NO-GATHER-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm3 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $16, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_42 +; SCATTER-NO-GATHER-NEXT: .LBB2_41: # %cond.load84 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; SCATTER-NO-GATHER-NEXT: testb $32, %al +; SCATTER-NO-GATHER-NEXT: je .LBB2_44 +; SCATTER-NO-GATHER-NEXT: .LBB2_43: # %cond.load89 +; SCATTER-NO-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; SCATTER-NO-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; SCATTER-NO-GATHER-NEXT: testb $64, %al +; SCATTER-NO-GATHER-NEXT: jne .LBB2_45 +; SCATTER-NO-GATHER-NEXT: jmp .LBB2_46 +; +; GATHER-NO-SCATTER-LABEL: gather_v8i32_v8i32: +; GATHER-NO-SCATTER: # %bb.0: +; GATHER-NO-SCATTER-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; GATHER-NO-SCATTER-NEXT: kmovw %k1, %k2 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; GATHER-NO-SCATTER-NEXT: vpgatherdd c+12(,%ymm0), %ymm1 {%k2} +; GATHER-NO-SCATTER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; GATHER-NO-SCATTER-NEXT: vpgatherdd c+28(,%ymm0), %ymm2 {%k1} +; GATHER-NO-SCATTER-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; GATHER-NO-SCATTER-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; GATHER-NO-SCATTER-NEXT: retq ; ; NO-SCATTER-GATHER-LABEL: gather_v8i32_v8i32: -; NO-SCATTER-GATHER-NOT: vpgatherdd +; NO-SCATTER-GATHER: # %bb.0: +; NO-SCATTER-GATHER-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; NO-SCATTER-GATHER-NEXT: kmovw %k0, %eax +; NO-SCATTER-GATHER-NEXT: testb $1, %al +; NO-SCATTER-GATHER-NEXT: # implicit-def: $ymm0 +; NO-SCATTER-GATHER-NEXT: jne .LBB2_1 +; NO-SCATTER-GATHER-NEXT: # %bb.2: # %else +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_3 +; NO-SCATTER-GATHER-NEXT: .LBB2_4: # %else2 +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_5 +; NO-SCATTER-GATHER-NEXT: .LBB2_6: # %else5 +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_7 +; NO-SCATTER-GATHER-NEXT: .LBB2_8: # %else8 +; NO-SCATTER-GATHER-NEXT: testb $16, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_9 +; NO-SCATTER-GATHER-NEXT: .LBB2_10: # %else11 +; NO-SCATTER-GATHER-NEXT: testb $32, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_11 +; NO-SCATTER-GATHER-NEXT: .LBB2_12: # %else14 +; NO-SCATTER-GATHER-NEXT: testb $64, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_13 +; NO-SCATTER-GATHER-NEXT: .LBB2_14: # %else17 +; NO-SCATTER-GATHER-NEXT: testb $-128, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_16 +; NO-SCATTER-GATHER-NEXT: .LBB2_15: # %cond.load19 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; NO-SCATTER-GATHER-NEXT: .LBB2_16: # %else20 +; NO-SCATTER-GATHER-NEXT: kmovw %k0, %eax +; NO-SCATTER-GATHER-NEXT: testb $1, %al +; NO-SCATTER-GATHER-NEXT: # implicit-def: $ymm1 +; 
NO-SCATTER-GATHER-NEXT: jne .LBB2_17 +; NO-SCATTER-GATHER-NEXT: # %bb.18: # %else26 +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_19 +; NO-SCATTER-GATHER-NEXT: .LBB2_20: # %else31 +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_21 +; NO-SCATTER-GATHER-NEXT: .LBB2_22: # %else36 +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_23 +; NO-SCATTER-GATHER-NEXT: .LBB2_24: # %else41 +; NO-SCATTER-GATHER-NEXT: testb $16, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_25 +; NO-SCATTER-GATHER-NEXT: .LBB2_26: # %else46 +; NO-SCATTER-GATHER-NEXT: testb $32, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_27 +; NO-SCATTER-GATHER-NEXT: .LBB2_28: # %else51 +; NO-SCATTER-GATHER-NEXT: testb $64, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_29 +; NO-SCATTER-GATHER-NEXT: .LBB2_30: # %else56 +; NO-SCATTER-GATHER-NEXT: testb $-128, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_32 +; NO-SCATTER-GATHER-NEXT: .LBB2_31: # %cond.load58 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; NO-SCATTER-GATHER-NEXT: .LBB2_32: # %else61 +; NO-SCATTER-GATHER-NEXT: kmovw %k0, %eax +; NO-SCATTER-GATHER-NEXT: testb $1, %al +; NO-SCATTER-GATHER-NEXT: # implicit-def: $ymm2 +; NO-SCATTER-GATHER-NEXT: jne .LBB2_33 +; NO-SCATTER-GATHER-NEXT: # %bb.34: # %else67 +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_35 +; NO-SCATTER-GATHER-NEXT: .LBB2_36: # %else72 +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_37 +; NO-SCATTER-GATHER-NEXT: .LBB2_38: # %else77 +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_39 +; NO-SCATTER-GATHER-NEXT: .LBB2_40: # %else82 +; NO-SCATTER-GATHER-NEXT: testb $16, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_41 +; NO-SCATTER-GATHER-NEXT: .LBB2_42: # %else87 +; NO-SCATTER-GATHER-NEXT: testb $32, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_43 +; NO-SCATTER-GATHER-NEXT: .LBB2_44: # %else92 +; NO-SCATTER-GATHER-NEXT: testb $64, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_46 +; NO-SCATTER-GATHER-NEXT: .LBB2_45: # %cond.load94 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; NO-SCATTER-GATHER-NEXT: .LBB2_46: # %else97 +; NO-SCATTER-GATHER-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; NO-SCATTER-GATHER-NEXT: testb $-128, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_48 +; NO-SCATTER-GATHER-NEXT: # %bb.47: # %cond.load99 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; NO-SCATTER-GATHER-NEXT: .LBB2_48: # %else102 +; NO-SCATTER-GATHER-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; NO-SCATTER-GATHER-NEXT: retq +; NO-SCATTER-GATHER-NEXT: .LBB2_1: # %cond.load +; NO-SCATTER-GATHER-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_4 +; NO-SCATTER-GATHER-NEXT: .LBB2_3: # %cond.load1 +; NO-SCATTER-GATHER-NEXT: vpinsrd $1, c+12(%rip), %xmm0, %xmm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_6 +; NO-SCATTER-GATHER-NEXT: .LBB2_5: # %cond.load4 +; NO-SCATTER-GATHER-NEXT: vpinsrd $2, c+12(%rip), %xmm0, %xmm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; 
NO-SCATTER-GATHER-NEXT: je .LBB2_8 +; NO-SCATTER-GATHER-NEXT: .LBB2_7: # %cond.load7 +; NO-SCATTER-GATHER-NEXT: vpinsrd $3, c+12(%rip), %xmm0, %xmm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $16, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_10 +; NO-SCATTER-GATHER-NEXT: .LBB2_9: # %cond.load10 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $32, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_12 +; NO-SCATTER-GATHER-NEXT: .LBB2_11: # %cond.load13 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; NO-SCATTER-GATHER-NEXT: testb $64, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_14 +; NO-SCATTER-GATHER-NEXT: .LBB2_13: # %cond.load16 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+12(%rip), %ymm1 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; NO-SCATTER-GATHER-NEXT: testb $-128, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_15 +; NO-SCATTER-GATHER-NEXT: jmp .LBB2_16 +; NO-SCATTER-GATHER-NEXT: .LBB2_17: # %cond.load23 +; NO-SCATTER-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_20 +; NO-SCATTER-GATHER-NEXT: .LBB2_19: # %cond.load28 +; NO-SCATTER-GATHER-NEXT: vpinsrd $1, c+28(%rip), %xmm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_22 +; NO-SCATTER-GATHER-NEXT: .LBB2_21: # %cond.load33 +; NO-SCATTER-GATHER-NEXT: vpinsrd $2, c+28(%rip), %xmm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_24 +; NO-SCATTER-GATHER-NEXT: .LBB2_23: # %cond.load38 +; NO-SCATTER-GATHER-NEXT: vpinsrd $3, c+28(%rip), %xmm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $16, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_26 +; NO-SCATTER-GATHER-NEXT: .LBB2_25: # %cond.load43 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $32, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_28 +; NO-SCATTER-GATHER-NEXT: .LBB2_27: # %cond.load48 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; NO-SCATTER-GATHER-NEXT: testb $64, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_30 +; NO-SCATTER-GATHER-NEXT: .LBB2_29: # %cond.load53 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm2 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; NO-SCATTER-GATHER-NEXT: testb $-128, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_31 +; NO-SCATTER-GATHER-NEXT: jmp .LBB2_32 +; NO-SCATTER-GATHER-NEXT: .LBB2_33: # %cond.load64 +; NO-SCATTER-GATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-SCATTER-GATHER-NEXT: testb $2, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_36 +; NO-SCATTER-GATHER-NEXT: .LBB2_35: # %cond.load69 +; NO-SCATTER-GATHER-NEXT: vpinsrd $1, c+28(%rip), %xmm2, %xmm3 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $4, %al +; 
NO-SCATTER-GATHER-NEXT: je .LBB2_38 +; NO-SCATTER-GATHER-NEXT: .LBB2_37: # %cond.load74 +; NO-SCATTER-GATHER-NEXT: vpinsrd $2, c+28(%rip), %xmm2, %xmm3 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $8, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_40 +; NO-SCATTER-GATHER-NEXT: .LBB2_39: # %cond.load79 +; NO-SCATTER-GATHER-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm3 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $16, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_42 +; NO-SCATTER-GATHER-NEXT: .LBB2_41: # %cond.load84 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; NO-SCATTER-GATHER-NEXT: testb $32, %al +; NO-SCATTER-GATHER-NEXT: je .LBB2_44 +; NO-SCATTER-GATHER-NEXT: .LBB2_43: # %cond.load89 +; NO-SCATTER-GATHER-NEXT: vpbroadcastd c+28(%rip), %ymm3 +; NO-SCATTER-GATHER-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; NO-SCATTER-GATHER-NEXT: testb $64, %al +; NO-SCATTER-GATHER-NEXT: jne .LBB2_45 +; NO-SCATTER-GATHER-NEXT: jmp .LBB2_46 %1 = icmp eq <8 x i32> %trigger, zeroinitializer %2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> getelementptr (%struct.a, <8 x ptr> , <8 x i64> zeroinitializer, i32 0, <8 x i64> ), i32 4, <8 x i1> %1, <8 x i32> undef) %3 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> getelementptr (%struct.a, <8 x ptr> , <8 x i64> zeroinitializer, i32 3), i32 4, <8 x i1> %1, <8 x i32> undef) @@ -154,19 +1580,1099 @@ declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) -; scatter test cases +; scatter test cases define void @scatter_test1(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { +; GATHER-LABEL: scatter_test1: +; GATHER: # %bb.0: +; GATHER-NEXT: vmovq %rdi, %xmm4 +; GATHER-NEXT: vpbroadcastq %xmm4, %ymm10 +; GATHER-NEXT: vpmovsxdq %xmm0, %ymm4 +; GATHER-NEXT: vpsllq $2, %ymm4, %ymm4 +; GATHER-NEXT: vpaddq %ymm4, %ymm10, %ymm4 +; GATHER-NEXT: testb $1, %sil +; GATHER-NEXT: je .LBB3_2 +; GATHER-NEXT: # %bb.1: # %cond.store +; GATHER-NEXT: vmovq %xmm4, %rax +; GATHER-NEXT: vmovss %xmm2, (%rax) +; GATHER-NEXT: .LBB3_2: # %else +; GATHER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; GATHER-NEXT: testb $2, %sil +; GATHER-NEXT: je .LBB3_4 +; GATHER-NEXT: # %bb.3: # %cond.store1 +; GATHER-NEXT: vpextrq $1, %xmm4, %rax +; GATHER-NEXT: vextractps $1, %xmm2, (%rax) +; GATHER-NEXT: .LBB3_4: # %else2 +; GATHER-NEXT: vpmovsxdq %xmm0, %ymm0 +; GATHER-NEXT: testb $4, %sil +; GATHER-NEXT: vextracti128 $1, %ymm4, %xmm6 +; GATHER-NEXT: je .LBB3_6 +; GATHER-NEXT: # %bb.5: # %cond.store3 +; GATHER-NEXT: vmovq %xmm6, %rax +; GATHER-NEXT: vextractps $2, %xmm2, (%rax) +; GATHER-NEXT: .LBB3_6: # %else4 +; GATHER-NEXT: vpsllq $2, %ymm0, %ymm0 +; GATHER-NEXT: testb $8, %sil +; GATHER-NEXT: je .LBB3_8 +; GATHER-NEXT: # %bb.7: # %cond.store5 +; GATHER-NEXT: vpextrq $1, %xmm6, %rax +; GATHER-NEXT: vextractps $3, %xmm2, (%rax) +; GATHER-NEXT: .LBB3_8: # %else6 +; GATHER-NEXT: vpaddq %ymm0, %ymm10, %ymm5 +; GATHER-NEXT: testb $16, %sil +; GATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; GATHER-NEXT: je .LBB3_10 +; GATHER-NEXT: # %bb.9: # %cond.store7 +; GATHER-NEXT: vmovq %xmm5, %rax +; GATHER-NEXT: vmovss %xmm0, (%rax) +; GATHER-NEXT: .LBB3_10: # %else8 +; GATHER-NEXT: testb $32, %sil +; GATHER-NEXT: je .LBB3_12 +; GATHER-NEXT: # %bb.11: # %cond.store9 +; GATHER-NEXT: vpextrq $1, %xmm5, %rax +; GATHER-NEXT: 
vextractps $1, %xmm0, (%rax) +; GATHER-NEXT: .LBB3_12: # %else10 +; GATHER-NEXT: vpmovsxdq %xmm1, %ymm7 +; GATHER-NEXT: testb $64, %sil +; GATHER-NEXT: vextracti128 $1, %ymm5, %xmm8 +; GATHER-NEXT: je .LBB3_14 +; GATHER-NEXT: # %bb.13: # %cond.store11 +; GATHER-NEXT: vmovq %xmm8, %rax +; GATHER-NEXT: vextractps $2, %xmm0, (%rax) +; GATHER-NEXT: .LBB3_14: # %else12 +; GATHER-NEXT: vpsllq $2, %ymm7, %ymm7 +; GATHER-NEXT: testb %sil, %sil +; GATHER-NEXT: jns .LBB3_16 +; GATHER-NEXT: # %bb.15: # %cond.store13 +; GATHER-NEXT: vpextrq $1, %xmm8, %rax +; GATHER-NEXT: vextractps $3, %xmm0, (%rax) +; GATHER-NEXT: .LBB3_16: # %else14 +; GATHER-NEXT: vpaddq %ymm7, %ymm10, %ymm7 +; GATHER-NEXT: testl $256, %esi # imm = 0x100 +; GATHER-NEXT: je .LBB3_18 +; GATHER-NEXT: # %bb.17: # %cond.store15 +; GATHER-NEXT: vmovq %xmm7, %rax +; GATHER-NEXT: vmovss %xmm3, (%rax) +; GATHER-NEXT: .LBB3_18: # %else16 +; GATHER-NEXT: vextracti128 $1, %ymm1, %xmm1 +; GATHER-NEXT: testl $512, %esi # imm = 0x200 +; GATHER-NEXT: je .LBB3_20 +; GATHER-NEXT: # %bb.19: # %cond.store17 +; GATHER-NEXT: vpextrq $1, %xmm7, %rax +; GATHER-NEXT: vextractps $1, %xmm3, (%rax) +; GATHER-NEXT: .LBB3_20: # %else18 +; GATHER-NEXT: vmovd %esi, %xmm11 +; GATHER-NEXT: vpmovsxdq %xmm1, %ymm1 +; GATHER-NEXT: testl $1024, %esi # imm = 0x400 +; GATHER-NEXT: vextracti128 $1, %ymm7, %xmm9 +; GATHER-NEXT: je .LBB3_22 +; GATHER-NEXT: # %bb.21: # %cond.store19 +; GATHER-NEXT: vmovq %xmm9, %rax +; GATHER-NEXT: vextractps $2, %xmm3, (%rax) +; GATHER-NEXT: .LBB3_22: # %else20 +; GATHER-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; GATHER-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GATHER-NEXT: vpsllq $2, %ymm1, %ymm1 +; GATHER-NEXT: testl $2048, %esi # imm = 0x800 +; GATHER-NEXT: je .LBB3_24 +; GATHER-NEXT: # %bb.23: # %cond.store21 +; GATHER-NEXT: vpextrq $1, %xmm9, %rax +; GATHER-NEXT: vextractps $3, %xmm3, (%rax) +; GATHER-NEXT: .LBB3_24: # %else22 +; GATHER-NEXT: vpand %xmm11, %xmm12, %xmm12 +; GATHER-NEXT: vpaddq %ymm1, %ymm10, %ymm10 +; GATHER-NEXT: testl $4096, %esi # imm = 0x1000 +; GATHER-NEXT: vextractf128 $1, %ymm3, %xmm1 +; GATHER-NEXT: je .LBB3_26 +; GATHER-NEXT: # %bb.25: # %cond.store23 +; GATHER-NEXT: vmovq %xmm10, %rax +; GATHER-NEXT: vmovss %xmm1, (%rax) +; GATHER-NEXT: .LBB3_26: # %else24 +; GATHER-NEXT: vpcmpeqb %xmm11, %xmm12, %xmm11 +; GATHER-NEXT: testl $8192, %esi # imm = 0x2000 +; GATHER-NEXT: je .LBB3_28 +; GATHER-NEXT: # %bb.27: # %cond.store25 +; GATHER-NEXT: vpextrq $1, %xmm10, %rax +; GATHER-NEXT: vextractps $1, %xmm1, (%rax) +; GATHER-NEXT: .LBB3_28: # %else26 +; GATHER-NEXT: vpsrlw $7, %xmm11, %xmm12 +; GATHER-NEXT: testl $16384, %esi # imm = 0x4000 +; GATHER-NEXT: vextracti128 $1, %ymm10, %xmm11 +; GATHER-NEXT: je .LBB3_30 +; GATHER-NEXT: # %bb.29: # %cond.store27 +; GATHER-NEXT: vmovq %xmm11, %rax +; GATHER-NEXT: vextractps $2, %xmm1, (%rax) +; GATHER-NEXT: .LBB3_30: # %else28 +; GATHER-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12, %xmm12 +; GATHER-NEXT: testl $32768, %esi # imm = 0x8000 +; GATHER-NEXT: je .LBB3_32 +; GATHER-NEXT: # %bb.31: # %cond.store29 +; GATHER-NEXT: vpextrq $1, %xmm11, %rax +; GATHER-NEXT: vextractps $3, %xmm1, (%rax) +; GATHER-NEXT: .LBB3_32: # %else30 +; GATHER-NEXT: vpsllw $7, %xmm12, %xmm12 +; GATHER-NEXT: vpmovmskb %xmm12, %eax +; GATHER-NEXT: testb $1, %al +; GATHER-NEXT: jne .LBB3_33 +; GATHER-NEXT: # %bb.34: # %else35 +; GATHER-NEXT: testb $2, %al +; GATHER-NEXT: jne .LBB3_35 +; GATHER-NEXT: .LBB3_36: # %else39 +; 
GATHER-NEXT: testb $4, %al +; GATHER-NEXT: jne .LBB3_37 +; GATHER-NEXT: .LBB3_38: # %else43 +; GATHER-NEXT: testb $8, %al +; GATHER-NEXT: jne .LBB3_39 +; GATHER-NEXT: .LBB3_40: # %else47 +; GATHER-NEXT: testb $16, %al +; GATHER-NEXT: jne .LBB3_41 +; GATHER-NEXT: .LBB3_42: # %else51 +; GATHER-NEXT: testb $32, %al +; GATHER-NEXT: jne .LBB3_43 +; GATHER-NEXT: .LBB3_44: # %else55 +; GATHER-NEXT: testb $64, %al +; GATHER-NEXT: jne .LBB3_45 +; GATHER-NEXT: .LBB3_46: # %else59 +; GATHER-NEXT: testb %al, %al +; GATHER-NEXT: js .LBB3_47 +; GATHER-NEXT: .LBB3_48: # %else63 +; GATHER-NEXT: testl $256, %eax # imm = 0x100 +; GATHER-NEXT: jne .LBB3_49 +; GATHER-NEXT: .LBB3_50: # %else67 +; GATHER-NEXT: testl $512, %eax # imm = 0x200 +; GATHER-NEXT: jne .LBB3_51 +; GATHER-NEXT: .LBB3_52: # %else71 +; GATHER-NEXT: testl $1024, %eax # imm = 0x400 +; GATHER-NEXT: jne .LBB3_53 +; GATHER-NEXT: .LBB3_54: # %else75 +; GATHER-NEXT: testl $2048, %eax # imm = 0x800 +; GATHER-NEXT: jne .LBB3_55 +; GATHER-NEXT: .LBB3_56: # %else79 +; GATHER-NEXT: testl $4096, %eax # imm = 0x1000 +; GATHER-NEXT: jne .LBB3_57 +; GATHER-NEXT: .LBB3_58: # %else83 +; GATHER-NEXT: testl $8192, %eax # imm = 0x2000 +; GATHER-NEXT: jne .LBB3_59 +; GATHER-NEXT: .LBB3_60: # %else87 +; GATHER-NEXT: testl $16384, %eax # imm = 0x4000 +; GATHER-NEXT: jne .LBB3_61 +; GATHER-NEXT: .LBB3_62: # %else91 +; GATHER-NEXT: testl $32768, %eax # imm = 0x8000 +; GATHER-NEXT: jne .LBB3_63 +; GATHER-NEXT: .LBB3_64: # %else95 +; GATHER-NEXT: vzeroupper +; GATHER-NEXT: retq +; GATHER-NEXT: .LBB3_33: # %cond.store32 +; GATHER-NEXT: vmovq %xmm4, %rcx +; GATHER-NEXT: vmovss %xmm2, (%rcx) +; GATHER-NEXT: testb $2, %al +; GATHER-NEXT: je .LBB3_36 +; GATHER-NEXT: .LBB3_35: # %cond.store36 +; GATHER-NEXT: vpextrq $1, %xmm4, %rcx +; GATHER-NEXT: vextractps $1, %xmm2, (%rcx) +; GATHER-NEXT: testb $4, %al +; GATHER-NEXT: je .LBB3_38 +; GATHER-NEXT: .LBB3_37: # %cond.store40 +; GATHER-NEXT: vmovq %xmm6, %rcx +; GATHER-NEXT: vextractps $2, %xmm2, (%rcx) +; GATHER-NEXT: testb $8, %al +; GATHER-NEXT: je .LBB3_40 +; GATHER-NEXT: .LBB3_39: # %cond.store44 +; GATHER-NEXT: vpextrq $1, %xmm6, %rcx +; GATHER-NEXT: vextractps $3, %xmm2, (%rcx) +; GATHER-NEXT: testb $16, %al +; GATHER-NEXT: je .LBB3_42 +; GATHER-NEXT: .LBB3_41: # %cond.store48 +; GATHER-NEXT: vmovq %xmm5, %rcx +; GATHER-NEXT: vmovss %xmm0, (%rcx) +; GATHER-NEXT: testb $32, %al +; GATHER-NEXT: je .LBB3_44 +; GATHER-NEXT: .LBB3_43: # %cond.store52 +; GATHER-NEXT: vpextrq $1, %xmm5, %rcx +; GATHER-NEXT: vextractps $1, %xmm0, (%rcx) +; GATHER-NEXT: testb $64, %al +; GATHER-NEXT: je .LBB3_46 +; GATHER-NEXT: .LBB3_45: # %cond.store56 +; GATHER-NEXT: vmovq %xmm8, %rcx +; GATHER-NEXT: vextractps $2, %xmm0, (%rcx) +; GATHER-NEXT: testb %al, %al +; GATHER-NEXT: jns .LBB3_48 +; GATHER-NEXT: .LBB3_47: # %cond.store60 +; GATHER-NEXT: vpextrq $1, %xmm8, %rcx +; GATHER-NEXT: vextractps $3, %xmm0, (%rcx) +; GATHER-NEXT: testl $256, %eax # imm = 0x100 +; GATHER-NEXT: je .LBB3_50 +; GATHER-NEXT: .LBB3_49: # %cond.store64 +; GATHER-NEXT: vmovq %xmm7, %rcx +; GATHER-NEXT: vmovss %xmm3, (%rcx) +; GATHER-NEXT: testl $512, %eax # imm = 0x200 +; GATHER-NEXT: je .LBB3_52 +; GATHER-NEXT: .LBB3_51: # %cond.store68 +; GATHER-NEXT: vpextrq $1, %xmm7, %rcx +; GATHER-NEXT: vextractps $1, %xmm3, (%rcx) +; GATHER-NEXT: testl $1024, %eax # imm = 0x400 +; GATHER-NEXT: je .LBB3_54 +; GATHER-NEXT: .LBB3_53: # %cond.store72 +; GATHER-NEXT: vmovq %xmm9, %rcx +; GATHER-NEXT: vextractps $2, %xmm3, (%rcx) +; GATHER-NEXT: testl $2048, %eax # imm = 0x800 +; 
GATHER-NEXT: je .LBB3_56 +; GATHER-NEXT: .LBB3_55: # %cond.store76 +; GATHER-NEXT: vpextrq $1, %xmm9, %rcx +; GATHER-NEXT: vextractps $3, %xmm3, (%rcx) +; GATHER-NEXT: testl $4096, %eax # imm = 0x1000 +; GATHER-NEXT: je .LBB3_58 +; GATHER-NEXT: .LBB3_57: # %cond.store80 +; GATHER-NEXT: vmovq %xmm10, %rcx +; GATHER-NEXT: vmovss %xmm1, (%rcx) +; GATHER-NEXT: testl $8192, %eax # imm = 0x2000 +; GATHER-NEXT: je .LBB3_60 +; GATHER-NEXT: .LBB3_59: # %cond.store84 +; GATHER-NEXT: vpextrq $1, %xmm10, %rcx +; GATHER-NEXT: vextractps $1, %xmm1, (%rcx) +; GATHER-NEXT: testl $16384, %eax # imm = 0x4000 +; GATHER-NEXT: je .LBB3_62 +; GATHER-NEXT: .LBB3_61: # %cond.store88 +; GATHER-NEXT: vmovq %xmm11, %rcx +; GATHER-NEXT: vextractps $2, %xmm1, (%rcx) +; GATHER-NEXT: testl $32768, %eax # imm = 0x8000 +; GATHER-NEXT: je .LBB3_64 +; GATHER-NEXT: .LBB3_63: # %cond.store92 +; GATHER-NEXT: vpextrq $1, %xmm11, %rax +; GATHER-NEXT: vextractps $3, %xmm1, (%rax) +; GATHER-NEXT: vzeroupper +; GATHER-NEXT: retq +; +; NO-GATHER-LABEL: scatter_test1: +; NO-GATHER: # %bb.0: +; NO-GATHER-NEXT: vmovq %rdi, %xmm4 +; NO-GATHER-NEXT: vpbroadcastq %xmm4, %ymm10 +; NO-GATHER-NEXT: vpmovsxdq %xmm0, %ymm4 +; NO-GATHER-NEXT: vpsllq $2, %ymm4, %ymm4 +; NO-GATHER-NEXT: vpaddq %ymm4, %ymm10, %ymm4 +; NO-GATHER-NEXT: testb $1, %sil +; NO-GATHER-NEXT: je .LBB3_2 +; NO-GATHER-NEXT: # %bb.1: # %cond.store +; NO-GATHER-NEXT: vmovq %xmm4, %rax +; NO-GATHER-NEXT: vmovss %xmm2, (%rax) +; NO-GATHER-NEXT: .LBB3_2: # %else +; NO-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NO-GATHER-NEXT: testb $2, %sil +; NO-GATHER-NEXT: je .LBB3_4 +; NO-GATHER-NEXT: # %bb.3: # %cond.store1 +; NO-GATHER-NEXT: vpextrq $1, %xmm4, %rax +; NO-GATHER-NEXT: vextractps $1, %xmm2, (%rax) +; NO-GATHER-NEXT: .LBB3_4: # %else2 +; NO-GATHER-NEXT: vpmovsxdq %xmm0, %ymm0 +; NO-GATHER-NEXT: testb $4, %sil +; NO-GATHER-NEXT: vextracti128 $1, %ymm4, %xmm6 +; NO-GATHER-NEXT: je .LBB3_6 +; NO-GATHER-NEXT: # %bb.5: # %cond.store3 +; NO-GATHER-NEXT: vmovq %xmm6, %rax +; NO-GATHER-NEXT: vextractps $2, %xmm2, (%rax) +; NO-GATHER-NEXT: .LBB3_6: # %else4 +; NO-GATHER-NEXT: vpsllq $2, %ymm0, %ymm0 +; NO-GATHER-NEXT: testb $8, %sil +; NO-GATHER-NEXT: je .LBB3_8 +; NO-GATHER-NEXT: # %bb.7: # %cond.store5 +; NO-GATHER-NEXT: vpextrq $1, %xmm6, %rax +; NO-GATHER-NEXT: vextractps $3, %xmm2, (%rax) +; NO-GATHER-NEXT: .LBB3_8: # %else6 +; NO-GATHER-NEXT: vpaddq %ymm0, %ymm10, %ymm5 +; NO-GATHER-NEXT: testb $16, %sil +; NO-GATHER-NEXT: vextractf128 $1, %ymm2, %xmm0 +; NO-GATHER-NEXT: je .LBB3_10 +; NO-GATHER-NEXT: # %bb.9: # %cond.store7 +; NO-GATHER-NEXT: vmovq %xmm5, %rax +; NO-GATHER-NEXT: vmovss %xmm0, (%rax) +; NO-GATHER-NEXT: .LBB3_10: # %else8 +; NO-GATHER-NEXT: testb $32, %sil +; NO-GATHER-NEXT: je .LBB3_12 +; NO-GATHER-NEXT: # %bb.11: # %cond.store9 +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %rax +; NO-GATHER-NEXT: vextractps $1, %xmm0, (%rax) +; NO-GATHER-NEXT: .LBB3_12: # %else10 +; NO-GATHER-NEXT: vpmovsxdq %xmm1, %ymm7 +; NO-GATHER-NEXT: testb $64, %sil +; NO-GATHER-NEXT: vextracti128 $1, %ymm5, %xmm8 +; NO-GATHER-NEXT: je .LBB3_14 +; NO-GATHER-NEXT: # %bb.13: # %cond.store11 +; NO-GATHER-NEXT: vmovq %xmm8, %rax +; NO-GATHER-NEXT: vextractps $2, %xmm0, (%rax) +; NO-GATHER-NEXT: .LBB3_14: # %else12 +; NO-GATHER-NEXT: vpsllq $2, %ymm7, %ymm7 +; NO-GATHER-NEXT: testb %sil, %sil +; NO-GATHER-NEXT: jns .LBB3_16 +; NO-GATHER-NEXT: # %bb.15: # %cond.store13 +; NO-GATHER-NEXT: vpextrq $1, %xmm8, %rax +; NO-GATHER-NEXT: vextractps $3, %xmm0, (%rax) +; NO-GATHER-NEXT: .LBB3_16: # %else14 +; 
NO-GATHER-NEXT: vpaddq %ymm7, %ymm10, %ymm7 +; NO-GATHER-NEXT: testl $256, %esi # imm = 0x100 +; NO-GATHER-NEXT: je .LBB3_18 +; NO-GATHER-NEXT: # %bb.17: # %cond.store15 +; NO-GATHER-NEXT: vmovq %xmm7, %rax +; NO-GATHER-NEXT: vmovss %xmm3, (%rax) +; NO-GATHER-NEXT: .LBB3_18: # %else16 +; NO-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NO-GATHER-NEXT: testl $512, %esi # imm = 0x200 +; NO-GATHER-NEXT: je .LBB3_20 +; NO-GATHER-NEXT: # %bb.19: # %cond.store17 +; NO-GATHER-NEXT: vpextrq $1, %xmm7, %rax +; NO-GATHER-NEXT: vextractps $1, %xmm3, (%rax) +; NO-GATHER-NEXT: .LBB3_20: # %else18 +; NO-GATHER-NEXT: vmovd %esi, %xmm11 +; NO-GATHER-NEXT: vpmovsxdq %xmm1, %ymm1 +; NO-GATHER-NEXT: testl $1024, %esi # imm = 0x400 +; NO-GATHER-NEXT: vextracti128 $1, %ymm7, %xmm9 +; NO-GATHER-NEXT: je .LBB3_22 +; NO-GATHER-NEXT: # %bb.21: # %cond.store19 +; NO-GATHER-NEXT: vmovq %xmm9, %rax +; NO-GATHER-NEXT: vextractps $2, %xmm3, (%rax) +; NO-GATHER-NEXT: .LBB3_22: # %else20 +; NO-GATHER-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; NO-GATHER-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; NO-GATHER-NEXT: vpsllq $2, %ymm1, %ymm1 +; NO-GATHER-NEXT: testl $2048, %esi # imm = 0x800 +; NO-GATHER-NEXT: je .LBB3_24 +; NO-GATHER-NEXT: # %bb.23: # %cond.store21 +; NO-GATHER-NEXT: vpextrq $1, %xmm9, %rax +; NO-GATHER-NEXT: vextractps $3, %xmm3, (%rax) +; NO-GATHER-NEXT: .LBB3_24: # %else22 +; NO-GATHER-NEXT: vpand %xmm11, %xmm12, %xmm12 +; NO-GATHER-NEXT: vpaddq %ymm1, %ymm10, %ymm10 +; NO-GATHER-NEXT: testl $4096, %esi # imm = 0x1000 +; NO-GATHER-NEXT: vextractf128 $1, %ymm3, %xmm1 +; NO-GATHER-NEXT: je .LBB3_26 +; NO-GATHER-NEXT: # %bb.25: # %cond.store23 +; NO-GATHER-NEXT: vmovq %xmm10, %rax +; NO-GATHER-NEXT: vmovss %xmm1, (%rax) +; NO-GATHER-NEXT: .LBB3_26: # %else24 +; NO-GATHER-NEXT: vpcmpeqb %xmm11, %xmm12, %xmm11 +; NO-GATHER-NEXT: testl $8192, %esi # imm = 0x2000 +; NO-GATHER-NEXT: je .LBB3_28 +; NO-GATHER-NEXT: # %bb.27: # %cond.store25 +; NO-GATHER-NEXT: vpextrq $1, %xmm10, %rax +; NO-GATHER-NEXT: vextractps $1, %xmm1, (%rax) +; NO-GATHER-NEXT: .LBB3_28: # %else26 +; NO-GATHER-NEXT: vpsrlw $7, %xmm11, %xmm12 +; NO-GATHER-NEXT: testl $16384, %esi # imm = 0x4000 +; NO-GATHER-NEXT: vextracti128 $1, %ymm10, %xmm11 +; NO-GATHER-NEXT: je .LBB3_30 +; NO-GATHER-NEXT: # %bb.29: # %cond.store27 +; NO-GATHER-NEXT: vmovq %xmm11, %rax +; NO-GATHER-NEXT: vextractps $2, %xmm1, (%rax) +; NO-GATHER-NEXT: .LBB3_30: # %else28 +; NO-GATHER-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12, %xmm12 +; NO-GATHER-NEXT: testl $32768, %esi # imm = 0x8000 +; NO-GATHER-NEXT: je .LBB3_32 +; NO-GATHER-NEXT: # %bb.31: # %cond.store29 +; NO-GATHER-NEXT: vpextrq $1, %xmm11, %rax +; NO-GATHER-NEXT: vextractps $3, %xmm1, (%rax) +; NO-GATHER-NEXT: .LBB3_32: # %else30 +; NO-GATHER-NEXT: vpsllw $7, %xmm12, %xmm12 +; NO-GATHER-NEXT: vpmovmskb %xmm12, %eax +; NO-GATHER-NEXT: testb $1, %al +; NO-GATHER-NEXT: jne .LBB3_33 +; NO-GATHER-NEXT: # %bb.34: # %else35 +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: jne .LBB3_35 +; NO-GATHER-NEXT: .LBB3_36: # %else39 +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: jne .LBB3_37 +; NO-GATHER-NEXT: .LBB3_38: # %else43 +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: jne .LBB3_39 +; NO-GATHER-NEXT: .LBB3_40: # %else47 +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: jne .LBB3_41 +; NO-GATHER-NEXT: .LBB3_42: # %else51 +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: jne .LBB3_43 +; NO-GATHER-NEXT: .LBB3_44: # %else55 +; 
NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: jne .LBB3_45 +; NO-GATHER-NEXT: .LBB3_46: # %else59 +; NO-GATHER-NEXT: testb %al, %al +; NO-GATHER-NEXT: js .LBB3_47 +; NO-GATHER-NEXT: .LBB3_48: # %else63 +; NO-GATHER-NEXT: testl $256, %eax # imm = 0x100 +; NO-GATHER-NEXT: jne .LBB3_49 +; NO-GATHER-NEXT: .LBB3_50: # %else67 +; NO-GATHER-NEXT: testl $512, %eax # imm = 0x200 +; NO-GATHER-NEXT: jne .LBB3_51 +; NO-GATHER-NEXT: .LBB3_52: # %else71 +; NO-GATHER-NEXT: testl $1024, %eax # imm = 0x400 +; NO-GATHER-NEXT: jne .LBB3_53 +; NO-GATHER-NEXT: .LBB3_54: # %else75 +; NO-GATHER-NEXT: testl $2048, %eax # imm = 0x800 +; NO-GATHER-NEXT: jne .LBB3_55 +; NO-GATHER-NEXT: .LBB3_56: # %else79 +; NO-GATHER-NEXT: testl $4096, %eax # imm = 0x1000 +; NO-GATHER-NEXT: jne .LBB3_57 +; NO-GATHER-NEXT: .LBB3_58: # %else83 +; NO-GATHER-NEXT: testl $8192, %eax # imm = 0x2000 +; NO-GATHER-NEXT: jne .LBB3_59 +; NO-GATHER-NEXT: .LBB3_60: # %else87 +; NO-GATHER-NEXT: testl $16384, %eax # imm = 0x4000 +; NO-GATHER-NEXT: jne .LBB3_61 +; NO-GATHER-NEXT: .LBB3_62: # %else91 +; NO-GATHER-NEXT: testl $32768, %eax # imm = 0x8000 +; NO-GATHER-NEXT: jne .LBB3_63 +; NO-GATHER-NEXT: .LBB3_64: # %else95 +; NO-GATHER-NEXT: vzeroupper +; NO-GATHER-NEXT: retq +; NO-GATHER-NEXT: .LBB3_33: # %cond.store32 +; NO-GATHER-NEXT: vmovq %xmm4, %rcx +; NO-GATHER-NEXT: vmovss %xmm2, (%rcx) +; NO-GATHER-NEXT: testb $2, %al +; NO-GATHER-NEXT: je .LBB3_36 +; NO-GATHER-NEXT: .LBB3_35: # %cond.store36 +; NO-GATHER-NEXT: vpextrq $1, %xmm4, %rcx +; NO-GATHER-NEXT: vextractps $1, %xmm2, (%rcx) +; NO-GATHER-NEXT: testb $4, %al +; NO-GATHER-NEXT: je .LBB3_38 +; NO-GATHER-NEXT: .LBB3_37: # %cond.store40 +; NO-GATHER-NEXT: vmovq %xmm6, %rcx +; NO-GATHER-NEXT: vextractps $2, %xmm2, (%rcx) +; NO-GATHER-NEXT: testb $8, %al +; NO-GATHER-NEXT: je .LBB3_40 +; NO-GATHER-NEXT: .LBB3_39: # %cond.store44 +; NO-GATHER-NEXT: vpextrq $1, %xmm6, %rcx +; NO-GATHER-NEXT: vextractps $3, %xmm2, (%rcx) +; NO-GATHER-NEXT: testb $16, %al +; NO-GATHER-NEXT: je .LBB3_42 +; NO-GATHER-NEXT: .LBB3_41: # %cond.store48 +; NO-GATHER-NEXT: vmovq %xmm5, %rcx +; NO-GATHER-NEXT: vmovss %xmm0, (%rcx) +; NO-GATHER-NEXT: testb $32, %al +; NO-GATHER-NEXT: je .LBB3_44 +; NO-GATHER-NEXT: .LBB3_43: # %cond.store52 +; NO-GATHER-NEXT: vpextrq $1, %xmm5, %rcx +; NO-GATHER-NEXT: vextractps $1, %xmm0, (%rcx) +; NO-GATHER-NEXT: testb $64, %al +; NO-GATHER-NEXT: je .LBB3_46 +; NO-GATHER-NEXT: .LBB3_45: # %cond.store56 +; NO-GATHER-NEXT: vmovq %xmm8, %rcx +; NO-GATHER-NEXT: vextractps $2, %xmm0, (%rcx) +; NO-GATHER-NEXT: testb %al, %al +; NO-GATHER-NEXT: jns .LBB3_48 +; NO-GATHER-NEXT: .LBB3_47: # %cond.store60 +; NO-GATHER-NEXT: vpextrq $1, %xmm8, %rcx +; NO-GATHER-NEXT: vextractps $3, %xmm0, (%rcx) +; NO-GATHER-NEXT: testl $256, %eax # imm = 0x100 +; NO-GATHER-NEXT: je .LBB3_50 +; NO-GATHER-NEXT: .LBB3_49: # %cond.store64 +; NO-GATHER-NEXT: vmovq %xmm7, %rcx +; NO-GATHER-NEXT: vmovss %xmm3, (%rcx) +; NO-GATHER-NEXT: testl $512, %eax # imm = 0x200 +; NO-GATHER-NEXT: je .LBB3_52 +; NO-GATHER-NEXT: .LBB3_51: # %cond.store68 +; NO-GATHER-NEXT: vpextrq $1, %xmm7, %rcx +; NO-GATHER-NEXT: vextractps $1, %xmm3, (%rcx) +; NO-GATHER-NEXT: testl $1024, %eax # imm = 0x400 +; NO-GATHER-NEXT: je .LBB3_54 +; NO-GATHER-NEXT: .LBB3_53: # %cond.store72 +; NO-GATHER-NEXT: vmovq %xmm9, %rcx +; NO-GATHER-NEXT: vextractps $2, %xmm3, (%rcx) +; NO-GATHER-NEXT: testl $2048, %eax # imm = 0x800 +; NO-GATHER-NEXT: je .LBB3_56 +; NO-GATHER-NEXT: .LBB3_55: # %cond.store76 +; NO-GATHER-NEXT: vpextrq $1, %xmm9, %rcx +; 
NO-GATHER-NEXT: vextractps $3, %xmm3, (%rcx) +; NO-GATHER-NEXT: testl $4096, %eax # imm = 0x1000 +; NO-GATHER-NEXT: je .LBB3_58 +; NO-GATHER-NEXT: .LBB3_57: # %cond.store80 +; NO-GATHER-NEXT: vmovq %xmm10, %rcx +; NO-GATHER-NEXT: vmovss %xmm1, (%rcx) +; NO-GATHER-NEXT: testl $8192, %eax # imm = 0x2000 +; NO-GATHER-NEXT: je .LBB3_60 +; NO-GATHER-NEXT: .LBB3_59: # %cond.store84 +; NO-GATHER-NEXT: vpextrq $1, %xmm10, %rcx +; NO-GATHER-NEXT: vextractps $1, %xmm1, (%rcx) +; NO-GATHER-NEXT: testl $16384, %eax # imm = 0x4000 +; NO-GATHER-NEXT: je .LBB3_62 +; NO-GATHER-NEXT: .LBB3_61: # %cond.store88 +; NO-GATHER-NEXT: vmovq %xmm11, %rcx +; NO-GATHER-NEXT: vextractps $2, %xmm1, (%rcx) +; NO-GATHER-NEXT: testl $32768, %eax # imm = 0x8000 +; NO-GATHER-NEXT: je .LBB3_64 +; NO-GATHER-NEXT: .LBB3_63: # %cond.store92 +; NO-GATHER-NEXT: vpextrq $1, %xmm11, %rax +; NO-GATHER-NEXT: vextractps $3, %xmm1, (%rax) +; NO-GATHER-NEXT: vzeroupper +; NO-GATHER-NEXT: retq +; ; SCATTER-LABEL: scatter_test1: -; SCATTER: vpscatterdd +; SCATTER: # %bb.0: +; SCATTER-NEXT: kmovw %esi, %k1 +; SCATTER-NEXT: kmovw %k1, %k2 +; SCATTER-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} +; SCATTER-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; SCATTER-NEXT: vzeroupper +; SCATTER-NEXT: retq ; ; SCATTER-NO-GATHER-LABEL: scatter_test1: -; SCATTER-NO-GATHER: vpscatterdd +; SCATTER-NO-GATHER: # %bb.0: +; SCATTER-NO-GATHER-NEXT: kmovw %esi, %k1 +; SCATTER-NO-GATHER-NEXT: kmovw %k1, %k2 +; SCATTER-NO-GATHER-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} +; SCATTER-NO-GATHER-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} +; SCATTER-NO-GATHER-NEXT: vzeroupper +; SCATTER-NO-GATHER-NEXT: retq ; ; GATHER-NO-SCATTER-LABEL: scatter_test1: -; GATHER-NO-SCATTER-NOT: vpscatterdd +; GATHER-NO-SCATTER: # %bb.0: +; GATHER-NO-SCATTER-NEXT: vpbroadcastq %rdi, %zmm3 +; GATHER-NO-SCATTER-NEXT: vpmovsxdq %ymm0, %zmm2 +; GATHER-NO-SCATTER-NEXT: vpsllq $2, %zmm2, %zmm2 +; GATHER-NO-SCATTER-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; GATHER-NO-SCATTER-NEXT: testb $1, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_1 +; GATHER-NO-SCATTER-NEXT: # %bb.2: # %else +; GATHER-NO-SCATTER-NEXT: testb $2, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_3 +; GATHER-NO-SCATTER-NEXT: .LBB3_4: # %else2 +; GATHER-NO-SCATTER-NEXT: testb $4, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_5 +; GATHER-NO-SCATTER-NEXT: .LBB3_6: # %else4 +; GATHER-NO-SCATTER-NEXT: testb $8, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_8 +; GATHER-NO-SCATTER-NEXT: .LBB3_7: # %cond.store5 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm2, %xmm4 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm4, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_8: # %else6 +; GATHER-NO-SCATTER-NEXT: testb $16, %sil +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; GATHER-NO-SCATTER-NEXT: je .LBB3_10 +; GATHER-NO-SCATTER-NEXT: # %bb.9: # %cond.store7 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm5, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_10: # %else8 +; GATHER-NO-SCATTER-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; GATHER-NO-SCATTER-NEXT: testb $32, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_12 +; GATHER-NO-SCATTER-NEXT: # %bb.11: # %cond.store9 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm5, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_12: # %else10 +; GATHER-NO-SCATTER-NEXT: vpmovsxdq %ymm0, %zmm0 +; 
GATHER-NO-SCATTER-NEXT: testb $64, %sil +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $3, %zmm2, %xmm6 +; GATHER-NO-SCATTER-NEXT: je .LBB3_14 +; GATHER-NO-SCATTER-NEXT: # %bb.13: # %cond.store11 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm6, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_14: # %else12 +; GATHER-NO-SCATTER-NEXT: vpsllq $2, %zmm0, %zmm0 +; GATHER-NO-SCATTER-NEXT: testb %sil, %sil +; GATHER-NO-SCATTER-NEXT: jns .LBB3_16 +; GATHER-NO-SCATTER-NEXT: # %bb.15: # %cond.store13 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm6, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_16: # %else14 +; GATHER-NO-SCATTER-NEXT: vpaddq %zmm0, %zmm3, %zmm0 +; GATHER-NO-SCATTER-NEXT: testl $256, %esi # imm = 0x100 +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_17 +; GATHER-NO-SCATTER-NEXT: # %bb.18: # %else16 +; GATHER-NO-SCATTER-NEXT: testl $512, %esi # imm = 0x200 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_19 +; GATHER-NO-SCATTER-NEXT: .LBB3_20: # %else18 +; GATHER-NO-SCATTER-NEXT: testl $1024, %esi # imm = 0x400 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_21 +; GATHER-NO-SCATTER-NEXT: .LBB3_22: # %else20 +; GATHER-NO-SCATTER-NEXT: testl $2048, %esi # imm = 0x800 +; GATHER-NO-SCATTER-NEXT: je .LBB3_24 +; GATHER-NO-SCATTER-NEXT: .LBB3_23: # %cond.store21 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm0, %xmm4 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm4, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_24: # %else22 +; GATHER-NO-SCATTER-NEXT: testl $4096, %esi # imm = 0x1000 +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; GATHER-NO-SCATTER-NEXT: je .LBB3_26 +; GATHER-NO-SCATTER-NEXT: # %bb.25: # %cond.store23 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm7, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_26: # %else24 +; GATHER-NO-SCATTER-NEXT: testl $8192, %esi # imm = 0x2000 +; GATHER-NO-SCATTER-NEXT: je .LBB3_28 +; GATHER-NO-SCATTER-NEXT: # %bb.27: # %cond.store25 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm7, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: .LBB3_28: # %else26 +; GATHER-NO-SCATTER-NEXT: testl $16384, %esi # imm = 0x4000 +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $3, %zmm0, %xmm8 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_29 +; GATHER-NO-SCATTER-NEXT: # %bb.30: # %else28 +; GATHER-NO-SCATTER-NEXT: testl $32768, %esi # imm = 0x8000 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_31 +; GATHER-NO-SCATTER-NEXT: .LBB3_32: # %else30 +; GATHER-NO-SCATTER-NEXT: testb $1, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_33 +; GATHER-NO-SCATTER-NEXT: .LBB3_34: # %else35 +; GATHER-NO-SCATTER-NEXT: testb $2, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_35 +; GATHER-NO-SCATTER-NEXT: .LBB3_36: # %else39 +; GATHER-NO-SCATTER-NEXT: testb $4, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_37 +; GATHER-NO-SCATTER-NEXT: .LBB3_38: # %else43 +; GATHER-NO-SCATTER-NEXT: testb $8, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_39 +; GATHER-NO-SCATTER-NEXT: .LBB3_40: # %else47 +; GATHER-NO-SCATTER-NEXT: testb $16, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_41 +; GATHER-NO-SCATTER-NEXT: .LBB3_42: # %else51 +; GATHER-NO-SCATTER-NEXT: testb $32, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_43 +; GATHER-NO-SCATTER-NEXT: .LBB3_44: # %else55 +; 
GATHER-NO-SCATTER-NEXT: testb $64, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_45 +; GATHER-NO-SCATTER-NEXT: .LBB3_46: # %else59 +; GATHER-NO-SCATTER-NEXT: testb %sil, %sil +; GATHER-NO-SCATTER-NEXT: js .LBB3_47 +; GATHER-NO-SCATTER-NEXT: .LBB3_48: # %else63 +; GATHER-NO-SCATTER-NEXT: testl $256, %esi # imm = 0x100 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_49 +; GATHER-NO-SCATTER-NEXT: .LBB3_50: # %else67 +; GATHER-NO-SCATTER-NEXT: testl $512, %esi # imm = 0x200 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_51 +; GATHER-NO-SCATTER-NEXT: .LBB3_52: # %else71 +; GATHER-NO-SCATTER-NEXT: testl $1024, %esi # imm = 0x400 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_53 +; GATHER-NO-SCATTER-NEXT: .LBB3_54: # %else75 +; GATHER-NO-SCATTER-NEXT: testl $2048, %esi # imm = 0x800 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_55 +; GATHER-NO-SCATTER-NEXT: .LBB3_56: # %else79 +; GATHER-NO-SCATTER-NEXT: testl $4096, %esi # imm = 0x1000 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_57 +; GATHER-NO-SCATTER-NEXT: .LBB3_58: # %else83 +; GATHER-NO-SCATTER-NEXT: testl $8192, %esi # imm = 0x2000 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_59 +; GATHER-NO-SCATTER-NEXT: .LBB3_60: # %else87 +; GATHER-NO-SCATTER-NEXT: testl $16384, %esi # imm = 0x4000 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_61 +; GATHER-NO-SCATTER-NEXT: .LBB3_62: # %else91 +; GATHER-NO-SCATTER-NEXT: testl $32768, %esi # imm = 0x8000 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_63 +; GATHER-NO-SCATTER-NEXT: .LBB3_64: # %else95 +; GATHER-NO-SCATTER-NEXT: vzeroupper +; GATHER-NO-SCATTER-NEXT: retq +; GATHER-NO-SCATTER-NEXT: .LBB3_1: # %cond.store +; GATHER-NO-SCATTER-NEXT: vmovq %xmm2, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $2, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_4 +; GATHER-NO-SCATTER-NEXT: .LBB3_3: # %cond.store1 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm2, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $4, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_6 +; GATHER-NO-SCATTER-NEXT: .LBB3_5: # %cond.store3 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm2, %xmm4 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm4, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $8, %sil +; GATHER-NO-SCATTER-NEXT: jne .LBB3_7 +; GATHER-NO-SCATTER-NEXT: jmp .LBB3_8 +; GATHER-NO-SCATTER-NEXT: .LBB3_17: # %cond.store15 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm0, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $512, %esi # imm = 0x200 +; GATHER-NO-SCATTER-NEXT: je .LBB3_20 +; GATHER-NO-SCATTER-NEXT: .LBB3_19: # %cond.store17 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm0, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $1024, %esi # imm = 0x400 +; GATHER-NO-SCATTER-NEXT: je .LBB3_22 +; GATHER-NO-SCATTER-NEXT: .LBB3_21: # %cond.store19 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm0, %xmm4 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm4, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $2048, %esi # imm = 0x800 +; GATHER-NO-SCATTER-NEXT: jne .LBB3_23 +; GATHER-NO-SCATTER-NEXT: jmp .LBB3_24 +; GATHER-NO-SCATTER-NEXT: .LBB3_29: # %cond.store27 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm8, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $32768, %esi # imm = 0x8000 +; GATHER-NO-SCATTER-NEXT: je .LBB3_32 +; GATHER-NO-SCATTER-NEXT: .LBB3_31: # %cond.store29 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm8, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm4, (%rax) +; 
GATHER-NO-SCATTER-NEXT: testb $1, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_34 +; GATHER-NO-SCATTER-NEXT: .LBB3_33: # %cond.store32 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm2, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $2, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_36 +; GATHER-NO-SCATTER-NEXT: .LBB3_35: # %cond.store36 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm2, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $4, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_38 +; GATHER-NO-SCATTER-NEXT: .LBB3_37: # %cond.store40 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm2, %xmm9 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm9, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $8, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_40 +; GATHER-NO-SCATTER-NEXT: .LBB3_39: # %cond.store44 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm2, %xmm2 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm2, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $16, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_42 +; GATHER-NO-SCATTER-NEXT: .LBB3_41: # %cond.store48 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm5, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm2, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $32, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_44 +; GATHER-NO-SCATTER-NEXT: .LBB3_43: # %cond.store52 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm5, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm2, (%rax) +; GATHER-NO-SCATTER-NEXT: testb $64, %sil +; GATHER-NO-SCATTER-NEXT: je .LBB3_46 +; GATHER-NO-SCATTER-NEXT: .LBB3_45: # %cond.store56 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm6, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm2, (%rax) +; GATHER-NO-SCATTER-NEXT: testb %sil, %sil +; GATHER-NO-SCATTER-NEXT: jns .LBB3_48 +; GATHER-NO-SCATTER-NEXT: .LBB3_47: # %cond.store60 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm1 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm6, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm1, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $256, %esi # imm = 0x100 +; GATHER-NO-SCATTER-NEXT: je .LBB3_50 +; GATHER-NO-SCATTER-NEXT: .LBB3_49: # %cond.store64 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm0, %rax +; GATHER-NO-SCATTER-NEXT: vmovd %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $512, %esi # imm = 0x200 +; GATHER-NO-SCATTER-NEXT: je .LBB3_52 +; GATHER-NO-SCATTER-NEXT: .LBB3_51: # %cond.store68 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm0, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $1024, %esi # imm = 0x400 +; GATHER-NO-SCATTER-NEXT: je .LBB3_54 +; GATHER-NO-SCATTER-NEXT: .LBB3_53: # %cond.store72 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm0, %xmm1 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm1, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $2048, %esi # imm = 0x800 +; GATHER-NO-SCATTER-NEXT: je .LBB3_56 +; GATHER-NO-SCATTER-NEXT: .LBB3_55: # %cond.store76 +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm0, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm3, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $4096, %esi # imm = 0x1000 +; GATHER-NO-SCATTER-NEXT: je .LBB3_58 +; GATHER-NO-SCATTER-NEXT: .LBB3_57: # %cond.store80 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm7, %rax +; GATHER-NO-SCATTER-NEXT: 
vmovd %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $8192, %esi # imm = 0x2000 +; GATHER-NO-SCATTER-NEXT: je .LBB3_60 +; GATHER-NO-SCATTER-NEXT: .LBB3_59: # %cond.store84 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm7, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $1, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $16384, %esi # imm = 0x4000 +; GATHER-NO-SCATTER-NEXT: je .LBB3_62 +; GATHER-NO-SCATTER-NEXT: .LBB3_61: # %cond.store88 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm8, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $2, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: testl $32768, %esi # imm = 0x8000 +; GATHER-NO-SCATTER-NEXT: je .LBB3_64 +; GATHER-NO-SCATTER-NEXT: .LBB3_63: # %cond.store92 +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm8, %rax +; GATHER-NO-SCATTER-NEXT: vpextrd $3, %xmm4, (%rax) +; GATHER-NO-SCATTER-NEXT: vzeroupper +; GATHER-NO-SCATTER-NEXT: retq ; ; NO-SCATTER-GATHER-LABEL: scatter_test1: -; NO-SCATTER-GATHER-NOT: vpscatterdd +; NO-SCATTER-GATHER: # %bb.0: +; NO-SCATTER-GATHER-NEXT: vpbroadcastq %rdi, %zmm3 +; NO-SCATTER-GATHER-NEXT: vpmovsxdq %ymm0, %zmm2 +; NO-SCATTER-GATHER-NEXT: vpsllq $2, %zmm2, %zmm2 +; NO-SCATTER-GATHER-NEXT: vpaddq %zmm2, %zmm3, %zmm2 +; NO-SCATTER-GATHER-NEXT: testb $1, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_1 +; NO-SCATTER-GATHER-NEXT: # %bb.2: # %else +; NO-SCATTER-GATHER-NEXT: testb $2, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_3 +; NO-SCATTER-GATHER-NEXT: .LBB3_4: # %else2 +; NO-SCATTER-GATHER-NEXT: testb $4, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_5 +; NO-SCATTER-GATHER-NEXT: .LBB3_6: # %else4 +; NO-SCATTER-GATHER-NEXT: testb $8, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_8 +; NO-SCATTER-GATHER-NEXT: .LBB3_7: # %cond.store5 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm4 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm4, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_8: # %else6 +; NO-SCATTER-GATHER-NEXT: testb $16, %sil +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm2, %xmm5 +; NO-SCATTER-GATHER-NEXT: je .LBB3_10 +; NO-SCATTER-GATHER-NEXT: # %bb.9: # %cond.store7 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm5, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_10: # %else8 +; NO-SCATTER-GATHER-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; NO-SCATTER-GATHER-NEXT: testb $32, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_12 +; NO-SCATTER-GATHER-NEXT: # %bb.11: # %cond.store9 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm5, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_12: # %else10 +; NO-SCATTER-GATHER-NEXT: vpmovsxdq %ymm0, %zmm0 +; NO-SCATTER-GATHER-NEXT: testb $64, %sil +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm2, %xmm6 +; NO-SCATTER-GATHER-NEXT: je .LBB3_14 +; NO-SCATTER-GATHER-NEXT: # %bb.13: # %cond.store11 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm6, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_14: # %else12 +; NO-SCATTER-GATHER-NEXT: vpsllq $2, %zmm0, %zmm0 +; NO-SCATTER-GATHER-NEXT: testb %sil, %sil +; NO-SCATTER-GATHER-NEXT: jns .LBB3_16 +; NO-SCATTER-GATHER-NEXT: # %bb.15: # %cond.store13 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm4 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm6, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_16: # %else14 +; NO-SCATTER-GATHER-NEXT: vpaddq 
%zmm0, %zmm3, %zmm0 +; NO-SCATTER-GATHER-NEXT: testl $256, %esi # imm = 0x100 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_17 +; NO-SCATTER-GATHER-NEXT: # %bb.18: # %else16 +; NO-SCATTER-GATHER-NEXT: testl $512, %esi # imm = 0x200 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_19 +; NO-SCATTER-GATHER-NEXT: .LBB3_20: # %else18 +; NO-SCATTER-GATHER-NEXT: testl $1024, %esi # imm = 0x400 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_21 +; NO-SCATTER-GATHER-NEXT: .LBB3_22: # %else20 +; NO-SCATTER-GATHER-NEXT: testl $2048, %esi # imm = 0x800 +; NO-SCATTER-GATHER-NEXT: je .LBB3_24 +; NO-SCATTER-GATHER-NEXT: .LBB3_23: # %cond.store21 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm4, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_24: # %else22 +; NO-SCATTER-GATHER-NEXT: testl $4096, %esi # imm = 0x1000 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm1, %xmm4 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; NO-SCATTER-GATHER-NEXT: je .LBB3_26 +; NO-SCATTER-GATHER-NEXT: # %bb.25: # %cond.store23 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm7, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_26: # %else24 +; NO-SCATTER-GATHER-NEXT: testl $8192, %esi # imm = 0x2000 +; NO-SCATTER-GATHER-NEXT: je .LBB3_28 +; NO-SCATTER-GATHER-NEXT: # %bb.27: # %cond.store25 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm7, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: .LBB3_28: # %else26 +; NO-SCATTER-GATHER-NEXT: testl $16384, %esi # imm = 0x4000 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm0, %xmm8 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_29 +; NO-SCATTER-GATHER-NEXT: # %bb.30: # %else28 +; NO-SCATTER-GATHER-NEXT: testl $32768, %esi # imm = 0x8000 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_31 +; NO-SCATTER-GATHER-NEXT: .LBB3_32: # %else30 +; NO-SCATTER-GATHER-NEXT: testb $1, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_33 +; NO-SCATTER-GATHER-NEXT: .LBB3_34: # %else35 +; NO-SCATTER-GATHER-NEXT: testb $2, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_35 +; NO-SCATTER-GATHER-NEXT: .LBB3_36: # %else39 +; NO-SCATTER-GATHER-NEXT: testb $4, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_37 +; NO-SCATTER-GATHER-NEXT: .LBB3_38: # %else43 +; NO-SCATTER-GATHER-NEXT: testb $8, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_39 +; NO-SCATTER-GATHER-NEXT: .LBB3_40: # %else47 +; NO-SCATTER-GATHER-NEXT: testb $16, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_41 +; NO-SCATTER-GATHER-NEXT: .LBB3_42: # %else51 +; NO-SCATTER-GATHER-NEXT: testb $32, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_43 +; NO-SCATTER-GATHER-NEXT: .LBB3_44: # %else55 +; NO-SCATTER-GATHER-NEXT: testb $64, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_45 +; NO-SCATTER-GATHER-NEXT: .LBB3_46: # %else59 +; NO-SCATTER-GATHER-NEXT: testb %sil, %sil +; NO-SCATTER-GATHER-NEXT: js .LBB3_47 +; NO-SCATTER-GATHER-NEXT: .LBB3_48: # %else63 +; NO-SCATTER-GATHER-NEXT: testl $256, %esi # imm = 0x100 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_49 +; NO-SCATTER-GATHER-NEXT: .LBB3_50: # %else67 +; NO-SCATTER-GATHER-NEXT: testl $512, %esi # imm = 0x200 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_51 +; NO-SCATTER-GATHER-NEXT: .LBB3_52: # %else71 +; NO-SCATTER-GATHER-NEXT: testl $1024, %esi # imm = 0x400 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_53 +; NO-SCATTER-GATHER-NEXT: .LBB3_54: # %else75 +; NO-SCATTER-GATHER-NEXT: testl $2048, %esi # imm = 0x800 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_55 +; NO-SCATTER-GATHER-NEXT: .LBB3_56: # %else79 
+; NO-SCATTER-GATHER-NEXT: testl $4096, %esi # imm = 0x1000 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_57 +; NO-SCATTER-GATHER-NEXT: .LBB3_58: # %else83 +; NO-SCATTER-GATHER-NEXT: testl $8192, %esi # imm = 0x2000 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_59 +; NO-SCATTER-GATHER-NEXT: .LBB3_60: # %else87 +; NO-SCATTER-GATHER-NEXT: testl $16384, %esi # imm = 0x4000 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_61 +; NO-SCATTER-GATHER-NEXT: .LBB3_62: # %else91 +; NO-SCATTER-GATHER-NEXT: testl $32768, %esi # imm = 0x8000 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_63 +; NO-SCATTER-GATHER-NEXT: .LBB3_64: # %else95 +; NO-SCATTER-GATHER-NEXT: vzeroupper +; NO-SCATTER-GATHER-NEXT: retq +; NO-SCATTER-GATHER-NEXT: .LBB3_1: # %cond.store +; NO-SCATTER-GATHER-NEXT: vmovq %xmm2, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $2, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_4 +; NO-SCATTER-GATHER-NEXT: .LBB3_3: # %cond.store1 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $4, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_6 +; NO-SCATTER-GATHER-NEXT: .LBB3_5: # %cond.store3 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm4 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm4, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $8, %sil +; NO-SCATTER-GATHER-NEXT: jne .LBB3_7 +; NO-SCATTER-GATHER-NEXT: jmp .LBB3_8 +; NO-SCATTER-GATHER-NEXT: .LBB3_17: # %cond.store15 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm0, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $512, %esi # imm = 0x200 +; NO-SCATTER-GATHER-NEXT: je .LBB3_20 +; NO-SCATTER-GATHER-NEXT: .LBB3_19: # %cond.store17 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm0, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $1024, %esi # imm = 0x400 +; NO-SCATTER-GATHER-NEXT: je .LBB3_22 +; NO-SCATTER-GATHER-NEXT: .LBB3_21: # %cond.store19 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm4 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm4, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $2048, %esi # imm = 0x800 +; NO-SCATTER-GATHER-NEXT: jne .LBB3_23 +; NO-SCATTER-GATHER-NEXT: jmp .LBB3_24 +; NO-SCATTER-GATHER-NEXT: .LBB3_29: # %cond.store27 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm8, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $32768, %esi # imm = 0x8000 +; NO-SCATTER-GATHER-NEXT: je .LBB3_32 +; NO-SCATTER-GATHER-NEXT: .LBB3_31: # %cond.store29 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm8, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $1, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_34 +; NO-SCATTER-GATHER-NEXT: .LBB3_33: # %cond.store32 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm2, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $2, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_36 +; NO-SCATTER-GATHER-NEXT: .LBB3_35: # %cond.store36 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $4, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_38 +; NO-SCATTER-GATHER-NEXT: .LBB3_37: # %cond.store40 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm9 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm9, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $8, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_40 +; 
NO-SCATTER-GATHER-NEXT: .LBB3_39: # %cond.store44 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $16, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_42 +; NO-SCATTER-GATHER-NEXT: .LBB3_41: # %cond.store48 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm5, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm2, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $32, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_44 +; NO-SCATTER-GATHER-NEXT: .LBB3_43: # %cond.store52 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm5, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm2, (%rax) +; NO-SCATTER-GATHER-NEXT: testb $64, %sil +; NO-SCATTER-GATHER-NEXT: je .LBB3_46 +; NO-SCATTER-GATHER-NEXT: .LBB3_45: # %cond.store56 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm6, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm2, (%rax) +; NO-SCATTER-GATHER-NEXT: testb %sil, %sil +; NO-SCATTER-GATHER-NEXT: jns .LBB3_48 +; NO-SCATTER-GATHER-NEXT: .LBB3_47: # %cond.store60 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm6, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm1, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $256, %esi # imm = 0x100 +; NO-SCATTER-GATHER-NEXT: je .LBB3_50 +; NO-SCATTER-GATHER-NEXT: .LBB3_49: # %cond.store64 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm0, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $512, %esi # imm = 0x200 +; NO-SCATTER-GATHER-NEXT: je .LBB3_52 +; NO-SCATTER-GATHER-NEXT: .LBB3_51: # %cond.store68 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm0, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $1024, %esi # imm = 0x400 +; NO-SCATTER-GATHER-NEXT: je .LBB3_54 +; NO-SCATTER-GATHER-NEXT: .LBB3_53: # %cond.store72 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm1 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm1, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $2048, %esi # imm = 0x800 +; NO-SCATTER-GATHER-NEXT: je .LBB3_56 +; NO-SCATTER-GATHER-NEXT: .LBB3_55: # %cond.store76 +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm0, %xmm0 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm0, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm3, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $4096, %esi # imm = 0x1000 +; NO-SCATTER-GATHER-NEXT: je .LBB3_58 +; NO-SCATTER-GATHER-NEXT: .LBB3_57: # %cond.store80 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm7, %rax +; NO-SCATTER-GATHER-NEXT: vmovd %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $8192, %esi # imm = 0x2000 +; NO-SCATTER-GATHER-NEXT: je .LBB3_60 +; NO-SCATTER-GATHER-NEXT: .LBB3_59: # %cond.store84 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm7, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $1, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $16384, %esi # imm = 0x4000 +; NO-SCATTER-GATHER-NEXT: je .LBB3_62 +; NO-SCATTER-GATHER-NEXT: .LBB3_61: # %cond.store88 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm8, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $2, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: testl $32768, %esi # imm = 0x8000 +; NO-SCATTER-GATHER-NEXT: je .LBB3_64 +; NO-SCATTER-GATHER-NEXT: .LBB3_63: # %cond.store92 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm8, %rax +; NO-SCATTER-GATHER-NEXT: vpextrd $3, %xmm4, (%rax) +; NO-SCATTER-GATHER-NEXT: vzeroupper +; 
NO-SCATTER-GATHER-NEXT: retq %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer @@ -181,17 +2687,168 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , <16 x i1> ) define <8 x i32> @scatter_test2(<8 x i32>%a1, <8 x ptr> %ptr) { +; GATHER-LABEL: scatter_test2: +; GATHER: # %bb.0: +; GATHER-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; GATHER-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; GATHER-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; GATHER-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; GATHER-NEXT: vpgatherqd %xmm6, (,%ymm2), %xmm5 +; GATHER-NEXT: vpgatherqd %xmm3, (,%ymm1), %xmm4 +; GATHER-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm3 +; GATHER-NEXT: vmovq %xmm1, %rax +; GATHER-NEXT: vmovss %xmm0, (%rax) +; GATHER-NEXT: vpextrq $1, %xmm1, %rax +; GATHER-NEXT: vextractps $1, %xmm0, (%rax) +; GATHER-NEXT: vextracti128 $1, %ymm1, %xmm1 +; GATHER-NEXT: vmovq %xmm1, %rax +; GATHER-NEXT: vextractps $2, %xmm0, (%rax) +; GATHER-NEXT: vpextrq $1, %xmm1, %rax +; GATHER-NEXT: vextractps $3, %xmm0, (%rax) +; GATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 +; GATHER-NEXT: vmovq %xmm2, %rax +; GATHER-NEXT: vmovss %xmm0, (%rax) +; GATHER-NEXT: vpextrq $1, %xmm2, %rax +; GATHER-NEXT: vextractps $1, %xmm0, (%rax) +; GATHER-NEXT: vextracti128 $1, %ymm2, %xmm1 +; GATHER-NEXT: vmovq %xmm1, %rax +; GATHER-NEXT: vextractps $2, %xmm0, (%rax) +; GATHER-NEXT: vpextrq $1, %xmm1, %rax +; GATHER-NEXT: vextractps $3, %xmm0, (%rax) +; GATHER-NEXT: vmovdqa %ymm3, %ymm0 +; GATHER-NEXT: retq +; +; NO-GATHER-LABEL: scatter_test2: +; NO-GATHER: # %bb.0: +; NO-GATHER-NEXT: vmovq %xmm1, %rax +; NO-GATHER-NEXT: vpextrq $1, %xmm1, %rcx +; NO-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm1 +; NO-GATHER-NEXT: vmovq %xmm1, %rdx +; NO-GATHER-NEXT: vpextrq $1, %xmm1, %rsi +; NO-GATHER-NEXT: vmovq %xmm2, %rdi +; NO-GATHER-NEXT: vpextrq $1, %xmm2, %r8 +; NO-GATHER-NEXT: vextracti128 $1, %ymm2, %xmm1 +; NO-GATHER-NEXT: vmovq %xmm1, %r9 +; NO-GATHER-NEXT: vpextrq $1, %xmm1, %r10 +; NO-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-GATHER-NEXT: vpinsrd $1, (%r8), %xmm1, %xmm1 +; NO-GATHER-NEXT: vpinsrd $2, (%r9), %xmm1, %xmm1 +; NO-GATHER-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1 +; NO-GATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-GATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 +; NO-GATHER-NEXT: vpinsrd $2, (%rdx), %xmm2, %xmm2 +; NO-GATHER-NEXT: vpinsrd $3, (%rsi), %xmm2, %xmm2 +; NO-GATHER-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NO-GATHER-NEXT: vmovss %xmm0, (%rax) +; NO-GATHER-NEXT: vextractps $1, %xmm0, (%rcx) +; NO-GATHER-NEXT: vextractps $2, %xmm0, (%rdx) +; NO-GATHER-NEXT: vextractps $3, %xmm0, (%rsi) +; NO-GATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 +; NO-GATHER-NEXT: vmovss %xmm0, (%rdi) +; NO-GATHER-NEXT: vextractps $1, %xmm0, (%r8) +; NO-GATHER-NEXT: vextractps $2, %xmm0, (%r9) +; NO-GATHER-NEXT: vextractps $3, %xmm0, (%r10) +; NO-GATHER-NEXT: vmovdqa %ymm1, %ymm0 +; NO-GATHER-NEXT: retq +; ; SCATTER-LABEL: scatter_test2: -; SCATTER: vpscatterqd +; SCATTER: # %bb.0: +; SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; SCATTER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; SCATTER-NEXT: kxnorw %k0, %k0, %k2 +; SCATTER-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} +; SCATTER-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; SCATTER-NEXT: vmovdqa %ymm2, %ymm0 +; SCATTER-NEXT: retq ; ; SCATTER-NO-GATHER-LABEL: scatter_test2: -; SCATTER-NO-GATHER: vpscatterqd +; SCATTER-NO-GATHER: # %bb.0: +; SCATTER-NO-GATHER-NEXT: 
vmovq %xmm1, %rax +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm1, %rcx +; SCATTER-NO-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vmovq %xmm2, %rdx +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm2, %rsi +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vmovq %xmm2, %rdi +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm2, %r8 +; SCATTER-NO-GATHER-NEXT: vextracti32x4 $3, %zmm1, %xmm2 +; SCATTER-NO-GATHER-NEXT: vmovq %xmm2, %r9 +; SCATTER-NO-GATHER-NEXT: vpextrq $1, %xmm2, %r10 +; SCATTER-NO-GATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SCATTER-NO-GATHER-NEXT: vpinsrd $1, (%r8), %xmm2, %xmm2 +; SCATTER-NO-GATHER-NEXT: vpinsrd $2, (%r9), %xmm2, %xmm2 +; SCATTER-NO-GATHER-NEXT: vpinsrd $3, (%r10), %xmm2, %xmm2 +; SCATTER-NO-GATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SCATTER-NO-GATHER-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3 +; SCATTER-NO-GATHER-NEXT: vpinsrd $2, (%rdx), %xmm3, %xmm3 +; SCATTER-NO-GATHER-NEXT: vpinsrd $3, (%rsi), %xmm3, %xmm3 +; SCATTER-NO-GATHER-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; SCATTER-NO-GATHER-NEXT: kxnorw %k0, %k0, %k1 +; SCATTER-NO-GATHER-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} +; SCATTER-NO-GATHER-NEXT: vmovdqa %ymm2, %ymm0 +; SCATTER-NO-GATHER-NEXT: retq ; ; GATHER-NO-SCATTER-LABEL: scatter_test2: -; GATHER-NO-SCATTER-NOT: vpscatterqd +; GATHER-NO-SCATTER: # %bb.0: +; GATHER-NO-SCATTER-NEXT: kxnorw %k0, %k0, %k1 +; GATHER-NO-SCATTER-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; GATHER-NO-SCATTER-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k1} +; GATHER-NO-SCATTER-NEXT: vmovq %xmm1, %rax +; GATHER-NO-SCATTER-NEXT: vmovss %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm1, %rax +; GATHER-NO-SCATTER-NEXT: vextractps $1, %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vextracti128 $1, %ymm1, %xmm3 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm3, %rax +; GATHER-NO-SCATTER-NEXT: vextractps $2, %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm3, %rax +; GATHER-NO-SCATTER-NEXT: vextractps $3, %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vextractf128 $1, %ymm0, %xmm0 +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $2, %zmm1, %xmm3 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm3, %rax +; GATHER-NO-SCATTER-NEXT: vmovss %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm3, %rax +; GATHER-NO-SCATTER-NEXT: vextractps $1, %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; GATHER-NO-SCATTER-NEXT: vmovq %xmm1, %rax +; GATHER-NO-SCATTER-NEXT: vextractps $2, %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vpextrq $1, %xmm1, %rax +; GATHER-NO-SCATTER-NEXT: vextractps $3, %xmm0, (%rax) +; GATHER-NO-SCATTER-NEXT: vmovdqa %ymm2, %ymm0 +; GATHER-NO-SCATTER-NEXT: retq ; ; NO-SCATTER-GATHER-LABEL: scatter_test2: -; NO-SCATTER-GATHER-NOT: vpscatterqd +; NO-SCATTER-GATHER: # %bb.0: +; NO-SCATTER-GATHER-NEXT: vmovq %xmm1, %rax +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm1, %rcx +; NO-SCATTER-GATHER-NEXT: vextracti128 $1, %ymm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm2, %rdx +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %rsi +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $2, %zmm1, %xmm2 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm2, %rdi +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm2, %r8 +; NO-SCATTER-GATHER-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; NO-SCATTER-GATHER-NEXT: vmovq %xmm1, %r9 +; NO-SCATTER-GATHER-NEXT: vpextrq $1, %xmm1, %r10 +; NO-SCATTER-GATHER-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-SCATTER-GATHER-NEXT: vpinsrd $1, (%r8), %xmm1, %xmm1 +; NO-SCATTER-GATHER-NEXT: vpinsrd $2, (%r9), %xmm1, %xmm1 +; 
NO-SCATTER-GATHER-NEXT: vpinsrd $3, (%r10), %xmm1, %xmm1 +; NO-SCATTER-GATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-SCATTER-GATHER-NEXT: vpinsrd $1, (%rcx), %xmm2, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpinsrd $2, (%rdx), %xmm2, %xmm2 +; NO-SCATTER-GATHER-NEXT: vpinsrd $3, (%rsi), %xmm2, %xmm2 +; NO-SCATTER-GATHER-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; NO-SCATTER-GATHER-NEXT: vmovss %xmm0, (%rax) +; NO-SCATTER-GATHER-NEXT: vextractps $1, %xmm0, (%rcx) +; NO-SCATTER-GATHER-NEXT: vextractps $2, %xmm0, (%rdx) +; NO-SCATTER-GATHER-NEXT: vextractps $3, %xmm0, (%rsi) +; NO-SCATTER-GATHER-NEXT: vextractf128 $1, %ymm0, %xmm0 +; NO-SCATTER-GATHER-NEXT: vmovss %xmm0, (%rdi) +; NO-SCATTER-GATHER-NEXT: vextractps $1, %xmm0, (%r8) +; NO-SCATTER-GATHER-NEXT: vextractps $2, %xmm0, (%r9) +; NO-SCATTER-GATHER-NEXT: vextractps $3, %xmm0, (%r10) +; NO-SCATTER-GATHER-NEXT: vmovdqa %ymm1, %ymm0 +; NO-SCATTER-GATHER-NEXT: retq %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptr, i32 4, <8 x i1> , <8 x i32> undef) call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> ) diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -1178,12 +1178,14 @@ ; ENABLE: ## %bb.0: ## %entry ; ENABLE-NEXT: pushq %rbx ; ENABLE-NEXT: subq $16, %rsp -; ENABLE-NEXT: xorl %eax, %eax -; ENABLE-NEXT: cmpb $0, _b(%rip) -; ENABLE-NEXT: movl $48, %ecx -; ENABLE-NEXT: cmovnel %eax, %ecx -; ENABLE-NEXT: movb %cl, _c(%rip) -; ENABLE-NEXT: je LBB14_4 +; ENABLE-NEXT: movzbl _b(%rip), %eax +; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: testb %al, %al +; ENABLE-NEXT: movl $48, %r8d +; ENABLE-NEXT: cmovnel %ecx, %r8d +; ENABLE-NEXT: movb %r8b, _c(%rip) +; ENABLE-NEXT: cmpb $1, %al +; ENABLE-NEXT: jne LBB14_4 ; ENABLE-NEXT: ## %bb.1: ## %for.body.lr.ph ; ENABLE-NEXT: ## InlineAsm Start ; ENABLE-NEXT: nop @@ -1213,12 +1215,14 @@ ; DISABLE: ## %bb.0: ## %entry ; DISABLE-NEXT: pushq %rbx ; DISABLE-NEXT: subq $16, %rsp -; DISABLE-NEXT: xorl %eax, %eax -; DISABLE-NEXT: cmpb $0, _b(%rip) -; DISABLE-NEXT: movl $48, %ecx -; DISABLE-NEXT: cmovnel %eax, %ecx -; DISABLE-NEXT: movb %cl, _c(%rip) -; DISABLE-NEXT: je LBB14_4 +; DISABLE-NEXT: movzbl _b(%rip), %eax +; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: testb %al, %al +; DISABLE-NEXT: movl $48, %r8d +; DISABLE-NEXT: cmovnel %ecx, %r8d +; DISABLE-NEXT: movb %r8b, _c(%rip) +; DISABLE-NEXT: cmpb $1, %al +; DISABLE-NEXT: jne LBB14_4 ; DISABLE-NEXT: ## %bb.1: ## %for.body.lr.ph ; DISABLE-NEXT: ## InlineAsm Start ; DISABLE-NEXT: nop diff --git a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll @@ -8,8 +8,9 @@ define <4 x i64> @broadcast128(<2 x i64> %src) { ; CHECK-LABEL: broadcast128: ; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %1 = alloca <2 x i64>, align 16 store <2 x i64> %src, ptr %1, align 16 diff --git a/llvm/test/CodeGen/X86/xop-shifts.ll b/llvm/test/CodeGen/X86/xop-shifts.ll --- a/llvm/test/CodeGen/X86/xop-shifts.ll +++ b/llvm/test/CodeGen/X86/xop-shifts.ll @@ -8,9 +8,10 @@ define <16 x i8> 
@demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: demandedelts_vpshab: ; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %shift = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %shuffle, <16 x i8> %a1) diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -394,8 +394,8 @@ define i32 @PR17487(i1 %tobool) { ; X86-LABEL: PR17487: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: notb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: notl %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testb $1, %cl ; X86-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1439,38 +1439,38 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: 
vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2: @@ -1522,12 +1522,12 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4: @@ -1606,12 +1606,12 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8: @@ -1830,12 +1830,12 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2: @@ -1845,23 +1845,23 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2: @@ -1915,12 +1915,12 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4: @@ -2122,38 +2122,82 @@ ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; 
AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX2-FAST-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load 
<64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -2175,12 +2219,12 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2: @@ -2190,23 +2234,23 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2: @@ -2291,11 +2335,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2327,8 +2371,8 @@ ; ; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -2465,10 +2509,10 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, 
(%rcx) +; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v4i64_to_v2i128_factor2: @@ -2478,11 +2522,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2672,14 +2716,14 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2: @@ -2689,15 +2733,15 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v24i16_factor2: @@ -2706,15 +2750,15 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 
16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; @@ -2739,6 +2783,8 @@ ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2798,30 +2844,30 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[6],zero,zero,xmm2[7],zero,zero,xmm2[8],zero,zero,xmm2[9],zero,zero,xmm2[10],zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = 
zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: retq +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX2: # %bb.0: @@ -2844,6 +2890,8 @@ ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2887,14 +2935,14 @@ ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4: @@ -2977,6 +3025,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3013,16 +3063,16 
@@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6: @@ -3032,28 +3082,28 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[3],zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,zero,xmm2[5],zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; @@ -3076,14 +3126,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] -; AVX512F-NEXT: 
vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3091,10 +3143,8 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -3126,14 +3176,14 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8: @@ -3216,6 +3266,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3275,28 +3327,28 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; 
SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; @@ -3319,14 +3371,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3442,14 +3496,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: 
vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3515,14 +3571,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3586,8 +3642,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -3663,14 +3719,14 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2: @@ -3683,12 +3739,12 @@ ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v12i32_factor2: @@ -3697,15 +3753,15 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; @@ -3730,6 +3786,8 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3770,16 +3828,16 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3: @@ -3789,51 +3847,51 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE42-NEXT: 
pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2> +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3841,14 +3899,14 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, 
%ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3856,14 +3914,14 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3877,6 +3935,8 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -3953,14 +4013,14 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, 
(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4: @@ -3984,30 +4044,30 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4015,12 +4075,12 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4028,12 +4088,12 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4084,17 +4144,17 @@ ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6: @@ -4120,24 +4180,24 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: retq -; -; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] @@ -4188,10 +4248,12 @@ ; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -4297,32 +4359,32 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper @@ -4330,15 +4392,15 @@ ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -4346,15 +4408,15 @@ ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper @@ -4367,11 +4429,13 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -4551,8 +4615,8 @@ ; SSE2-NEXT: movaps 16(%rdx), %xmm1 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; @@ -4565,8 +4629,8 @@ ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; @@ -4585,18 +4649,44 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; 
AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX512F: # %bb.0: @@ -4644,14 +4734,14 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2: @@ -4664,12 +4754,12 @@ ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v6i64_factor2: @@ -4678,15 +4768,15 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; @@ -4697,10 +4787,10 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4709,11 +4799,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4787,20 +4877,20 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4808,16 +4898,16 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4825,14 +4915,14 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4840,14 +4930,14 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4858,11 +4948,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u> ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4953,36 +5043,36 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 
16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper @@ -4990,15 +5080,15 @@ ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; 
AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -5008,15 +5098,15 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5027,11 +5117,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u> ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5122,12 +5212,12 @@ ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5163,13 +5253,13 @@ ; ; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovaps 
32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5181,10 +5271,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: movb $65, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,1,2,3,4,5,17,7,16,1,2,3,4,5,17,7] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -5226,8 +5317,8 @@ ; SSE2-NEXT: movaps 16(%rdx), %xmm0 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; @@ -5240,8 +5331,8 @@ ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; @@ -5249,9 +5340,9 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -5320,12 +5411,12 @@ ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb 32(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v6i64_to_v3i128_factor2: @@ -5338,13 +5429,13 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5371,11 +5462,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: 
vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5445,12 +5536,12 @@ ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5472,10 +5563,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: movb $9, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [8,1,2,9,8,1,2,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -5516,8 +5608,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -5525,8 +5617,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -5590,8 +5682,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -5621,8 +5713,9 @@ ; ; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5661,19 +5754,19 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 
= xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2: @@ -5682,53 +5775,53 @@ ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE42-NEXT: paddb (%rdx), %xmm4 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: paddb 48(%rdx), %xmm1 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) -; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxbw 
{{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5736,13 +5829,13 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} 
ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5774,20 +5867,20 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4: @@ -5795,19 +5888,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -5816,20 +5909,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4: @@ -5851,11 +5944,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5888,20 +5981,20 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; 
SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8: @@ -5909,20 +6002,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: psrlq $48, %xmm2 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psrlq $48, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -5931,26 +6024,26 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm2 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; 
AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero @@ -5966,11 +6059,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6000,20 +6093,20 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb 48(%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: paddb 48(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) ; SSE-NEXT: retq ; @@ 
-6022,20 +6115,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: @@ -6062,18 +6155,18 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -6081,18 +6174,18 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; 
AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -6108,11 +6201,11 @@ ; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -6127,12 +6220,12 @@ ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: @@ -6207,8 +6300,8 @@ ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq @@ -6217,13 +6310,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 
32(%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6231,13 +6324,13 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0] +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6273,8 +6366,8 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq @@ -6348,19 +6441,19 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: 
vec512_v32i16_to_v16i32_factor2: @@ -6374,48 +6467,48 @@ ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) -; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa 
%ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6424,11 +6517,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6462,20 +6555,20 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4: @@ -6483,19 +6576,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) 
; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -6504,26 +6597,26 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -6539,11 +6632,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6576,18 +6669,18 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; 
SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; @@ -6598,18 +6691,18 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) ; SSE42-NEXT: retq ; @@ -6619,20 +6712,20 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: @@ -6659,16 +6752,16 @@ ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -6677,16 +6770,16 @@ ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -6713,25 +6806,26 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,50,17,18,19,20,21,22,23,51,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -6799,52 +6893,107 @@ ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512BW-LABEL: 
vec512_v32i16_to_v2i256_factor16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX2-FAST-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: 
vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-FAST-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -6867,8 +7016,8 @@ ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq @@ -6883,8 +7032,8 @@ ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq @@ -6904,18 +7053,44 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, 
%ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX512F: # %bb.0: @@ -6960,19 +7135,19 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2: @@ -6986,48 +7161,48 @@ ; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) -; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 
16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7036,11 +7211,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7124,16 +7299,16 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -7142,16 +7317,16 @@ ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -7160,34 +7335,34 @@ ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,3,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -7198,11 +7373,11 @@ ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, 
%ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7276,18 +7451,17 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[1],zero,zero,zero ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8: @@ -7309,13 +7483,13 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -7323,13 +7497,13 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -7340,11 +7514,11 @@ ; AVX512F-NEXT: movw $257, %ax # imm = 0x101 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %zmm0, 
%zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7397,8 +7571,8 @@ ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq @@ -7413,8 +7587,8 @@ ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq @@ -7423,9 +7597,9 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -7493,37 +7667,37 @@ ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), 
%xmm1, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -7532,14 +7706,14 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7550,11 +7724,11 @@ ; AVX512F-NEXT: movb $85, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7623,27 +7797,27 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7654,11 +7828,11 @@ ; AVX512F-NEXT: movb $17, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; 
AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7695,8 +7869,8 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq @@ -7705,8 +7879,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -7781,16 +7955,16 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2: @@ -7813,11 +7987,11 @@ ; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7853,8 +8027,8 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq @@ -7885,8 +8059,9 @@ ; ; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) @@ -7919,30 +8094,30 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: paddb 16(%rdx), %xmm1 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: paddb 16(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: movaps 
%xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v2i256_to_v1i512_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -316,7 +316,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -326,7 +326,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -336,7 +336,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -976,38 +976,77 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: 
vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -1027,22 +1066,22 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb 
(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1050,21 +1089,21 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1072,21 +1111,21 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: 
vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1160,20 +1199,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1181,19 +1220,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) 
+; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1201,18 +1240,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1285,20 +1324,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1306,19 +1345,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = 
xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1326,18 +1365,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1410,19 +1449,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1430,17 +1469,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3 -; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) 
-; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,16 +1487,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1535,25 +1574,25 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1561,20 +1600,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = 
[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1582,19 +1621,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1666,20 +1706,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, 
%xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1687,16 +1727,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1704,16 +1744,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1785,19 +1825,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1805,15 +1845,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; 
SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1821,15 +1861,15 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1901,19 +1941,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1921,16 +1961,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: 
@@ -1945,11 +1985,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2059,15 +2099,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2075,15 +2115,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2097,11 +2137,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2211,15 +2251,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: 
paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2227,15 +2267,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2249,11 +2289,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2285,12 +2325,12 @@ ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2311,12 +2351,12 @@ ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2337,9 
+2377,9 @@ ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -2415,10 +2455,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2441,61 +2481,120 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; 
AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq - %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 - %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 - %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias +; AVX512DQ-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512BW-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 @@ -2561,13 +2660,13 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: 
vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2604,10 +2703,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2623,10 +2724,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2710,10 +2813,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2738,39 +2841,43 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: 
vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2857,13 +2964,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2900,10 +3007,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2919,10 +3028,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 
%ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3007,10 +3118,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3035,39 +3146,43 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3120,8 +3235,8 @@ ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: 
vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: @@ -3153,13 +3268,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3196,10 +3311,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3215,10 +3332,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3286,8 +3405,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -3300,11 +3419,11 @@ ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -3340,8 +3459,10 @@ ; AVX512F-NEXT: vpandn %ymm3, %ymm2, 
%ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper @@ -3360,8 +3481,10 @@ ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -3450,8 +3573,8 @@ ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3577,8 +3700,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: @@ -3590,91 +3713,176 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX2-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; 
AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; 
AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,25,0,27,0,29,0,31] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3734,8 +3942,8 @@ ; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: @@ -3750,47 +3958,91 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; 
AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, 
%xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -3799,17 +4051,20 @@ ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -3818,18 +4073,22 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; 
AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -3838,17 +4097,20 @@ ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -3858,14 +4120,17 @@ ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,25,26,0,28,29,0,31] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -3934,8 +4199,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: @@ -3949,30 +4214,32 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3983,8 +4250,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 
%xmm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4000,8 +4269,10 @@ ; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4012,32 +4283,38 @@ ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; 
AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4048,32 +4325,38 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4085,14 +4368,18 @@ ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15] +; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -4145,8 +4432,8 @@ ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4166,8 +4453,8 @@ ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4178,52 +4465,96 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; -; 
AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -4232,17 +4563,20 @@ ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; 
AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -4251,18 +4585,22 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -4271,17 +4609,20 @@ ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -4291,14 +4632,17 @@ ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: 
; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,10,11,12,13,16,15,u,u,u,u,16,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -4365,8 +4709,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4379,12 +4723,12 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4393,15 +4737,17 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4411,15 +4757,19 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4429,32 +4779,55 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) 
+; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -4526,72 +4899,162 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper 
-; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX2-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 +; 
AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,10,11,12,13,14,15,u,u,u,u,16,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -4623,8 +5086,8 @@ ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: @@ -4642,8 +5105,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: @@ -4654,28 +5117,28 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, 
%ymm0 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4685,8 +5148,8 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4736,11 +5199,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4754,25 +5217,25 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,29,0,31,0,1,0,1] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4782,9 +5245,9 @@ ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,29,0,31,0,1,0,1] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] -; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, 
%zmm0, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero @@ -4823,8 +5286,8 @@ ; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4843,8 +5306,8 @@ ; SSE42-NEXT: paddb 16(%rdx), %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4866,8 +5329,8 @@ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -4907,8 +5370,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4926,8 +5389,8 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4941,11 +5404,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4959,11 +5422,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5026,8 +5489,8 @@ ; 
SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -5044,29 +5507,29 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5084,8 +5547,8 @@ ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5099,11 +5562,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5117,11 +5580,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, 
%ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5304,8 +5767,8 @@ ; SSE2-NEXT: paddb %xmm0, %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5321,8 +5784,8 @@ ; SSE42-NEXT: paddb %xmm0, %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5331,18 +5794,18 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[2] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5359,8 +5822,8 @@ ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5374,11 +5837,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5392,11 +5855,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) 
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5424,7 +5887,7 @@ ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -5479,19 +5942,19 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5595,17 +6058,17 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5613,17 +6076,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; 
SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5631,14 +6094,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5647,10 +6110,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5660,10 +6123,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5673,10 +6136,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; 
AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5708,17 +6171,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5726,17 +6189,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5744,14 +6207,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5760,10 +6223,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: 
vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5773,10 +6236,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5786,10 +6249,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5821,17 +6284,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5839,17 +6302,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 -; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 -; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 +; SSE42-NEXT: paddb %xmm0, %xmm2 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 +; SSE42-NEXT: paddb %xmm0, %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: 
vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5857,14 +6320,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5873,10 +6336,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5886,10 +6349,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5899,10 +6362,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5933,17 +6396,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; 
SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5951,14 +6414,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5967,10 +6430,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5980,10 +6443,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5993,10 +6456,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6043,14 +6506,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 
16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: @@ -6059,10 +6522,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6072,10 +6535,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6085,10 +6548,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6120,17 +6583,17 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: @@ -6139,17 +6602,17 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 
32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: @@ -6158,14 +6621,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: @@ -6174,10 +6637,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6187,10 +6650,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6200,10 +6663,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6237,17 +6700,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -6257,17 +6720,17 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -6277,14 +6740,14 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -6293,10 +6756,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6306,10 +6769,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6319,10 +6782,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6355,17 +6818,17 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -6374,17 +6837,17 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa (%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -6393,14 +6856,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; 
AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -6410,10 +6873,10 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6424,10 +6887,10 @@ ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6438,10 +6901,10 @@ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6508,54 +6971,106 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: 
vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, 
%xmm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512DQ-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512DQ-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: @@ -6587,17 +7102,17 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -6607,17 +7122,17 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -6628,15 +7143,15 @@ ; AVX-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6647,10 +7162,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6661,11 +7176,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6676,11 +7191,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6714,17 +7229,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: movdqa 16(%rdx), %xmm0 +; SSE2-NEXT: movdqa (%rdx), %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 
48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: @@ -6733,17 +7248,17 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa (%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: @@ -6753,15 +7268,15 @@ ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6772,10 +7287,10 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6786,11 +7301,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6801,11 +7316,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, 
%zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6873,14 +7388,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: @@ -6889,10 +7404,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6903,11 +7418,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6918,11 +7433,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6954,17 +7469,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; 
SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: @@ -6974,15 +7489,15 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6993,10 +7508,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7007,11 +7522,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7022,11 +7537,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7075,14 +7590,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; 
AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: @@ -7090,10 +7605,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7104,11 +7619,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7119,11 +7634,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7169,24 +7684,24 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7197,11 +7712,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: 
vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7212,11 +7727,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -283,7 +283,7 @@ ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -292,7 +292,7 @@ ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -301,7 +301,7 @@ ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -609,16 +609,16 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512F-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -626,40 +626,37 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; 
AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> @@ -711,12 +708,12 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -724,36 +721,29 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, 
%zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> @@ -855,19 +845,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -876,17 +866,17 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -896,16 +886,16 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vmovddup 
{{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -966,18 +956,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -989,10 +979,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1003,10 +993,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ 
-1064,18 +1054,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1087,10 +1077,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1101,10 +1091,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1164,15 +1154,15 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1180,24 +1170,24 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 +; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: 
movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1268,10 +1258,10 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1279,16 +1269,16 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE42-NEXT: pshufb %xmm2, %xmm1 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rsi), %xmm3 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1296,15 +1286,15 @@ ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 
%xmm0, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1361,30 +1351,30 @@ ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1392,10 +1382,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1452,27 +1442,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: 
paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1480,10 +1470,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1542,22 +1532,22 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: retq -; -; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1565,11 +1555,11 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1626,35 +1616,35 @@ ; SSE2-NEXT: movaps (%rdi), %xmm0 ; SSE2-NEXT: movaps 32(%rdi), %xmm1 ; SSE2-NEXT: movaps 48(%rdi), %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: 
vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1738,36 +1728,36 @@ ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: movapd 32(%rdi), %xmm1 +; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1780,62 +1770,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; 
AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512DQ-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <4 x i32> @@ -1897,10 +1857,10 
@@ ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -1922,33 +1882,33 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; 
AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2024,13 +1984,13 @@ ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2061,10 +2021,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2077,10 +2039,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: 
vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2150,10 +2114,10 @@ ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2183,10 +2147,12 @@ ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2200,10 +2166,12 @@ ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2277,13 +2245,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2314,10 +2282,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: 
vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2330,10 +2300,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2404,10 +2376,10 @@ ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2429,35 +2401,39 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 -; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2465,7 +2441,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm2 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} @@ -2504,8 +2480,8 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: @@ -2531,13 +2507,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2568,10 +2544,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: 
vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2584,10 +2562,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2645,8 +2625,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -2656,11 +2636,11 @@ ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -2691,10 +2671,12 @@ ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2709,10 +2691,12 @@ ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2772,8 +2756,8 @@ ; SSE42-NEXT: paddb (%rsi), %xmm2 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: movaps %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: retq ; ; 
AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2786,8 +2770,8 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2905,61 +2889,65 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3014,109 +3002,155 @@ ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpblendw 
{{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; 
AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; 
AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: @@ -3161,110 +3195,115 @@ ; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; 
AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper @@ -3272,31 +3311,34 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -3342,8 +3384,8 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; 
SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -3352,115 +3394,161 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],mem[1,2,3,4,5],xmm3[6],mem[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2,3,4,5],xmm2[6],mem[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), 
%xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: @@ -3504,76 +3592,84 @@ ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 
32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm1[1,2,3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; 
AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3648,47 +3744,109 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512DQ: # %bb.0: 
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX2-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512F-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; 
AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: @@ -3713,35 +3871,34 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,0,1,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa 16(%rsi), %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,1,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdx) +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: @@ -3752,70 +3909,36 @@ ; AVX-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; 
AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: 
vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: @@ -3823,11 +3946,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3837,11 +3960,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3873,15 +3996,15 @@ ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -3890,88 +4013,53 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb 
32(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vbroadcastss (%rdi), %ymm2 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,1,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: 
vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: @@ -3979,11 +4067,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3993,11 +4081,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4030,45 +4118,45 @@ ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa 16(%rsi), %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; 
SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4093,11 +4181,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4107,11 +4195,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4254,13 +4342,13 @@ ; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; 
SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -4269,29 +4357,30 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: paddb 32(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4316,11 +4405,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4330,11 +4419,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4367,11 +4456,11 @@ ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rsi), %xmm2 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 
(%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4381,28 +4470,27 @@ ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE42-NEXT: movaps 32(%rsi), %xmm2 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movaps %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovapd (%rdi), %ymm0 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1] +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4477,58 +4565,58 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; 
SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4536,10 +4624,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4547,10 +4635,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 
32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4576,58 +4664,58 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: 
retq ; @@ -4635,10 +4723,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4646,10 +4734,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4675,58 +4763,58 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4734,10 +4822,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4745,10 +4833,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4774,41 +4862,41 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq 
; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4816,10 +4904,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4827,10 +4915,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4871,24 +4959,24 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] ; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: 
vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4896,10 +4984,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] ; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4907,10 +4995,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] ; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4939,58 +5027,58 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 
-; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4998,10 +5086,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5009,10 +5097,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5041,17 +5129,17 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -5059,17 +5147,17 @@ ; 
SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rsi), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -5077,24 +5165,24 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5102,10 +5190,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5113,10 +5201,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; 
AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5144,48 +5232,48 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -5193,10 +5281,10 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; 
AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5205,10 +5293,10 @@ ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5217,10 +5305,10 @@ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5278,48 +5366,92 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] -; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 
(%rsi), %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] -; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] -; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512DQ-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX512DQ-FAST-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: @@ -5346,17 +5478,17 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,1,1] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: 
movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -5364,17 +5496,17 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rsi), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -5383,15 +5515,15 @@ ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5401,10 +5533,10 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -5412,10 +5544,10 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
 ; AVX2-FAST-PERLANE-NEXT: retq
 ;
@@ -5423,10 +5555,10 @@
 ; AVX2-FAST: # %bb.0:
 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-FAST-NEXT: vzeroupper
 ; AVX2-FAST-NEXT: retq
 ;
@@ -5436,11 +5568,11 @@
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -5450,11 +5582,11 @@
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -5482,17 +5614,17 @@
 ; SSE-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
 ; SSE: # %bb.0:
 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movdqa 16(%rsi), %xmm1
+; SSE-NEXT: movdqa (%rsi), %xmm1
 ; SSE-NEXT: paddb %xmm0, %xmm1
-; SSE-NEXT: movdqa (%rsi), %xmm2
+; SSE-NEXT: movdqa 16(%rsi), %xmm2
 ; SSE-NEXT: paddb %xmm0, %xmm2
-; SSE-NEXT: movdqa 48(%rsi), %xmm3
+; SSE-NEXT: movdqa 32(%rsi), %xmm3
 ; SSE-NEXT: paddb %xmm0, %xmm3
-; SSE-NEXT: paddb 32(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE-NEXT: movdqa %xmm3, 48(%rdx)
-; SSE-NEXT: movdqa %xmm2, (%rdx)
-; SSE-NEXT: movdqa %xmm1, 16(%rdx)
+; SSE-NEXT: paddb 48(%rsi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 48(%rdx)
+; SSE-NEXT: movdqa %xmm3, 32(%rdx)
+; SSE-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE-NEXT: movdqa %xmm1, (%rdx)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
@@ -5500,15 +5632,15 @@
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm3
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
@@ -5517,10 +5649,10 @@
 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -5529,11 +5661,11 @@
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -5542,11 +5674,11 @@
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15]
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -5587,23 +5719,24 @@
 ; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2
 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
 ; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -5612,11 +5745,11 @@
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -5625,11 +5758,11 @@
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15]
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -5656,17 +5789,17 @@
 ; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
 ; SSE: # %bb.0:
 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movdqa 16(%rsi), %xmm1
+; SSE-NEXT: movdqa (%rsi), %xmm1
 ; SSE-NEXT: paddb %xmm0, %xmm1
-; SSE-NEXT: movdqa (%rsi), %xmm2
+; SSE-NEXT: movdqa 16(%rsi), %xmm2
 ; SSE-NEXT: paddb %xmm0, %xmm2
-; SSE-NEXT: movdqa 48(%rsi), %xmm3
+; SSE-NEXT: movdqa 32(%rsi), %xmm3
 ; SSE-NEXT: paddb %xmm0, %xmm3
-; SSE-NEXT: paddb 32(%rsi), %xmm0
-; SSE-NEXT: movdqa %xmm0, 32(%rdx)
-; SSE-NEXT: movdqa %xmm3, 48(%rdx)
-; SSE-NEXT: movdqa %xmm2, (%rdx)
-; SSE-NEXT: movdqa %xmm1, 16(%rdx)
+; SSE-NEXT: paddb 48(%rsi), %xmm0
+; SSE-NEXT: movdqa %xmm0, 48(%rdx)
+; SSE-NEXT: movdqa %xmm3, 32(%rdx)
+; SSE-NEXT: movdqa %xmm2, 16(%rdx)
+; SSE-NEXT: movdqa %xmm1, (%rdx)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
@@ -5674,15 +5807,15 @@
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 16(%rdx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rdx)
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm3
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
 ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
@@ -5691,10 +5824,10 @@
 ; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -5704,11 +5837,11 @@
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -5718,11 +5851,11 @@
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15]
 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -5764,23 +5897,24 @@
 ; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2
 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
 ; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -5789,11 +5923,11 @@
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7]
 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -5802,11 +5936,11 @@
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7]
 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -5835,13 +5969,13 @@
 ; SSE-NEXT: movdqa (%rdi), %xmm0
 ; SSE-NEXT: movaps 16(%rsi), %xmm1
 ; SSE-NEXT: movaps 48(%rsi), %xmm2
-; SSE-NEXT: movdqa 32(%rsi), %xmm3
+; SSE-NEXT: movdqa (%rsi), %xmm3
 ; SSE-NEXT: paddb %xmm0, %xmm3
-; SSE-NEXT: paddb (%rsi), %xmm0
+; SSE-NEXT: paddb 32(%rsi), %xmm0
 ; SSE-NEXT: movaps %xmm2, 48(%rdx)
 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
-; SSE-NEXT: movdqa %xmm0, (%rdx)
-; SSE-NEXT: movdqa %xmm3, 32(%rdx)
+; SSE-NEXT: movdqa %xmm0, 32(%rdx)
+; SSE-NEXT: movdqa %xmm3, (%rdx)
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
@@ -5891,14 +6025,14 @@
 ; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX-NEXT: shrq $56, %rax
 ; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1
+; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2
 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3
-; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
 ; AVX-NEXT: vmovaps %xmm3, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
+; AVX-NEXT: vmovaps %xmm2, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT: popq %rbx
 ; AVX-NEXT: retq
 ;
@@ -5949,50 +6083,175 @@
 ; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX2-NEXT: shrq $56, %rax
 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX2-NEXT: popq %rbx
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq (%rdi), %rax
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: movq %rax, %r8
+; AVX512F-NEXT: movq %rax, %r9
+; AVX512F-NEXT: movq %rax, %r10
+; AVX512F-NEXT: movl %eax, %r11d
+; AVX512F-NEXT: movl %eax, %ebx
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: shrl $16, %ebx
+; AVX512F-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX512F-NEXT: shrl $24, %r11d
+; AVX512F-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512F-NEXT: shrq $32, %r10
+; AVX512F-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX512F-NEXT: shrq $40, %r9
+; AVX512F-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
+; AVX512F-NEXT: shrq $48, %r8
+; AVX512F-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX512F-NEXT: movq 8(%rdi), %rax
+; AVX512F-NEXT: shrq $56, %rcx
+; AVX512F-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: shrl $8, %ecx
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: shrl $16, %ecx
+; AVX512F-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: shrl $24, %ecx
+; AVX512F-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: shrq $32, %rcx
+; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: shrq $40, %rcx
+; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: movq %rax, %rcx
+; AVX512F-NEXT: shrq $48, %rcx
+; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512F-NEXT: shrq $56, %rax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1
 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512F-NEXT: popq %rbx
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512DQ-NEXT: pushq %rbx
+; AVX512DQ-NEXT: movq (%rdi), %rax
+; AVX512DQ-NEXT: movq %rax, %rcx
+; AVX512DQ-NEXT: movq %rax, %r8
+; AVX512DQ-NEXT: movq %rax, %r9
+; AVX512DQ-NEXT: movq %rax, %r10
+; AVX512DQ-NEXT: movl %eax, %r11d
+; AVX512DQ-NEXT: movl %eax, %ebx
+; AVX512DQ-NEXT: vmovd %eax, %xmm0
+; AVX512DQ-NEXT: shrl $8, %eax
+; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT: shrl $16, %ebx
+; AVX512DQ-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX512DQ-NEXT: shrl $24, %r11d
+; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512DQ-NEXT: shrq $32, %r10
+; AVX512DQ-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX512DQ-NEXT: shrq $40, %r9
+; AVX512DQ-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
+; AVX512DQ-NEXT: shrq $48, %r8
+; AVX512DQ-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX512DQ-NEXT: movq 8(%rdi), %rax
+; AVX512DQ-NEXT: shrq $56, %rcx
+; AVX512DQ-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: movl %eax, %ecx
+; AVX512DQ-NEXT: shrl $8, %ecx
+; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: movl %eax, %ecx
+; AVX512DQ-NEXT: shrl $16, %ecx
+; AVX512DQ-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: movl %eax, %ecx
+; AVX512DQ-NEXT: shrl $24, %ecx
+; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: movq %rax, %rcx
+; AVX512DQ-NEXT: shrq $32, %rcx
+; AVX512DQ-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: movq %rax, %rcx
+; AVX512DQ-NEXT: shrq $40, %rcx
+; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: movq %rax, %rcx
+; AVX512DQ-NEXT: shrq $48, %rcx
+; AVX512DQ-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT: shrq $56, %rax
+; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1
 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512DQ-NEXT: popq %rbx
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0
+; AVX512BW-NEXT: pushq %rbx
+; AVX512BW-NEXT: movq (%rdi), %rax
+; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: movq %rax, %r8
+; AVX512BW-NEXT: movq %rax, %r9
+; AVX512BW-NEXT: movq %rax, %r10
+; AVX512BW-NEXT: movl %eax, %r11d
+; AVX512BW-NEXT: movl %eax, %ebx
+; AVX512BW-NEXT: vmovd %eax, %xmm0
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: shrl $16, %ebx
+; AVX512BW-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
+; AVX512BW-NEXT: shrl $24, %r11d
+; AVX512BW-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512BW-NEXT: shrq $32, %r10
+; AVX512BW-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX512BW-NEXT: shrq $40, %r9
+; AVX512BW-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
+; AVX512BW-NEXT: shrq $48, %r8
+; AVX512BW-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
+; AVX512BW-NEXT: movq 8(%rdi), %rax
+; AVX512BW-NEXT: shrq $56, %rcx
+; AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $24, %ecx
+; AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: shrq $32, %rcx
+; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: shrq $40, %rcx
+; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: movq %rax, %rcx
+; AVX512BW-NEXT: shrq $48, %rcx
+; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512BW-NEXT: shrq $56, %rax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT: popq %rbx
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
diff --git a/llvm/test/CodeGen/X86/zext-demanded.ll b/llvm/test/CodeGen/X86/zext-demanded.ll
--- a/llvm/test/CodeGen/X86/zext-demanded.ll
+++ b/llvm/test/CodeGen/X86/zext-demanded.ll
@@ -140,7 +140,8 @@
 define i32 @PR36689(ptr) {
 ; CHECK-LABEL: PR36689:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: movzbl 1(%rdi), %eax
+; CHECK-NEXT: shll $8, %eax
 ; CHECK-NEXT: orl $255, %eax
 ; CHECK-NEXT: retq
 %2 = load i32, ptr %0
diff --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
--- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
+++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll
@@ -14,7 +14,7 @@
 ;
 ; X64-LABEL: test1:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movzbl (%rdi), %eax
 ; X64-NEXT: shll $2, %eax
 ; X64-NEXT: andl $60, %eax
 ; X64-NEXT: retq
@@ -37,7 +37,7 @@
 ;
 ; X64-LABEL: test2:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movzbl (%rdi), %eax
 ; X64-NEXT: andl $15, %eax
 ; X64-NEXT: leaq (%rdi,%rax,4), %rax
 ; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll
--- a/llvm/test/CodeGen/X86/zext-shl.ll
+++ b/llvm/test/CodeGen/X86/zext-shl.ll
@@ -51,7 +51,7 @@
 ; X64-LABEL: i64_zext_shift_i16_zext_i8:
 ; X64: # %bb.0:
 ; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: shlq $5, %rax
+; X64-NEXT: shll $5, %eax
 ; X64-NEXT: retq
 %t0 = zext i8 %a0 to i16
 %t1 = shl i16 %t0, 5